# Tests for sklearn.metrics._pairwise_distances_reduction
# (sklearn/metrics/tests/test_pairwise_distances_reduction.py)
import itertools
import re
import warnings
from collections import defaultdict
import numpy as np
import pytest
import threadpoolctl
from math import log10, floor
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cdist
from sklearn.metrics._pairwise_distances_reduction import (
BaseDistancesReductionDispatcher,
ArgKmin,
RadiusNeighbors,
sqeuclidean_row_norms,
)
from sklearn.metrics import euclidean_distances
from sklearn.utils.fixes import sp_version, parse_version
from sklearn.utils._testing import (
assert_array_equal,
assert_allclose,
create_memmap_backed_data,
)
# Common supported metrics between scipy.spatial.distance.cdist
# and BaseDistancesReductionDispatcher.
# This allows constructing tests to check consistency of results
# of concrete BaseDistancesReductionDispatcher on some metrics using APIs
# from scipy and numpy.
CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS = [
    "braycurtis",
    "canberra",
    "chebyshev",
    "cityblock",
    "euclidean",
    "minkowski",
    "seuclidean",
]
def _get_metric_params_list(metric: str, n_features: int, seed: int = 1):
"""Return list of dummy DistanceMetric kwargs for tests."""
# Distinguishing on cases not to compute unneeded datastructures.
rng = np.random.RandomState(seed)
if metric == "minkowski":
minkowski_kwargs = [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)]
if sp_version >= parse_version("1.8.0.dev0"):
# TODO: remove the test once we no longer support scipy < 1.8.0.
# Recent scipy versions accept weights in the Minkowski metric directly:
# type: ignore
minkowski_kwargs.append(dict(p=3, w=rng.rand(n_features)))
return minkowski_kwargs
# TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0.
if metric == "wminkowski":
weights = rng.random_sample(n_features)
weights /= weights.sum()
wminkowski_kwargs = [dict(p=1.5, w=weights)]
if sp_version < parse_version("1.8.0.dev0"):
# wminkowski was removed in scipy 1.8.0 but should work for previous
# versions.
wminkowski_kwargs.append(dict(p=3, w=rng.rand(n_features)))
return wminkowski_kwargs
if metric == "seuclidean":
return [dict(V=rng.rand(n_features))]
# Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric.
# In those cases, no kwargs is needed.
return [{}]
def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1e-7):
    """Assert exact index equality and distance closeness of argkmin results."""
    indices_err_msg = "Query vectors have different neighbors' indices"
    dist_err_msg = "Query vectors have different neighbors' distances"

    # Indices must match exactly; distances only up to `rtol`.
    assert_array_equal(ref_indices, indices, err_msg=indices_err_msg)
    assert_allclose(ref_dist, dist, rtol=rtol, err_msg=dist_err_msg)
def relative_rounding(scalar, n_significant_digits):
    """Round a scalar to a number of significant digits relatively to its value."""
    if scalar == 0:
        return 0.0
    # Position of the leading digit: e.g. 3 for 123.456, 1 for 1.23.
    leading_digit_position = floor(log10(abs(scalar))) + 1
    ndigits = n_significant_digits - int(leading_digit_position)
    return round(scalar, ndigits)
def test_relative_rounding():
    """Check relative_rounding on zero, integer and float inputs."""
    # (scalar, n_significant_digits, expected) triples.
    cases = [
        (0, 1, 0.0),
        (0, 10, 0.0),
        (0, 123456, 0.0),
        (123456789, 0, 0),
        (123456789, 2, 120000000),
        (123456789, 3, 123000000),
        (123456789, 10, 123456789),
        (123456789, 20, 123456789),
        (1.23456789, 2, 1.2),
        (1.23456789, 3, 1.23),
        (1.23456789, 10, 1.23456789),
        (123.456789, 3, 123.0),
        (123.456789, 9, 123.456789),
        (123.456789, 10, 123.456789),
    ]
    for scalar, n_digits, expected in cases:
        assert relative_rounding(scalar, n_digits) == expected
def assert_argkmin_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    rtol=1e-4,
):
    """Assert that argkmin results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.
    """
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])

    # e.g. rtol=1e-4 -> compare distances rounded to 3 significant digits.
    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    assert (
        ref_dist.shape == dist.shape == ref_indices.shape == indices.shape
    ), "Arrays of results have various shapes."

    n_queries, n_neighbors = ref_dist.shape

    # Asserting equality results one row at a time
    for query_idx in range(n_queries):
        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]

        assert is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"

        assert_allclose(ref_dist_row, dist_row, rtol=rtol)

        ref_indices_row = ref_indices[query_idx]
        indices_row = indices[query_idx]

        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of decimals of significant digits derived from rtol.
        # Indices whose distances round to the same value are interchangeable,
        # so equality is only required at the level of those groups.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)

        for neighbor_rank in range(n_neighbors):
            rounded_dist = relative_rounding(
                ref_dist_row[neighbor_rank],
                n_significant_digits=n_significant_digits,
            )
            reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank])
            effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])

        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for rounded_distance in reference_neighbors_groups.keys():
            assert (
                reference_neighbors_groups[rounded_distance]
                == effective_neighbors_groups[rounded_distance]
            ), msg
def assert_radius_neighbors_results_equality(
    ref_dist, dist, ref_indices, indices, radius
):
    """Assert exact equality of radius-neighbors results, row by row."""
    # Results are ragged (arrays of arrays), so compare each query's
    # neighbors individually.
    n_queries = ref_dist.shape[0]
    for i in range(n_queries):
        # Every reference neighbor must lie within the query radius.
        assert (ref_dist[i] <= radius).all()

        assert_array_equal(
            ref_indices[i],
            indices[i],
            err_msg=f"Query vector #{i} has different neighbors' indices",
        )
        assert_allclose(
            ref_dist[i],
            dist[i],
            rtol=1e-7,
            err_msg=f"Query vector #{i} has different neighbors' distances",
        )
def assert_radius_neighbors_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    radius,
    rtol=1e-4,
):
    """Assert that radius neighborhood results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level
      - missing or extra last elements if their distance is
        close to the radius

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.

    Input arrays must be sorted w.r.t distances.
    """
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])

    # e.g. rtol=1e-4 -> compare distances rounded to 3 significant digits.
    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    assert (
        len(ref_dist) == len(dist) == len(ref_indices) == len(indices)
    ), "Arrays of results have various lengths."

    n_queries = len(ref_dist)

    # Asserting equality of results one vector at a time
    for query_idx in range(n_queries):
        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]

        assert is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"

        # Vectors' lengths might be different due to small
        # numerical differences of distance w.r.t the `radius` threshold.
        largest_row = ref_dist_row if len(ref_dist_row) > len(dist_row) else dist_row

        # For the longest distances vector, we check that last extra elements
        # that aren't present in the other vector are all in: [radius ± rtol]
        min_length = min(len(ref_dist_row), len(dist_row))
        last_extra_elements = largest_row[min_length:]
        if last_extra_elements.size > 0:
            # Bug fix: the previous chained comparison
            # `radius - rtol <= last_extra_elements <= radius + rtol`
            # implicitly calls bool() on a boolean array and raises
            # "The truth value of an array with more than one element is
            # ambiguous" whenever there is more than one extra element.
            # Combine the two elementwise comparisons with `&` instead.
            assert np.all(
                (radius - rtol <= last_extra_elements)
                & (last_extra_elements <= radius + rtol)
            ), (
                f"The last extra elements ({last_extra_elements}) aren't in [radius ±"
                f" rtol]=[{radius} ± {rtol}]"
            )

        # We truncate the neighbors results list on the smallest length to
        # be able to compare them, ignoring the elements checked above.
        ref_dist_row = ref_dist_row[:min_length]
        dist_row = dist_row[:min_length]

        assert_allclose(ref_dist_row, dist_row, rtol=rtol)

        ref_indices_row = ref_indices[query_idx]
        indices_row = indices[query_idx]

        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of significant digits derived from rtol.
        # Indices whose distances round to the same value are interchangeable.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)

        for neighbor_rank in range(min_length):
            rounded_dist = relative_rounding(
                ref_dist_row[neighbor_rank],
                n_significant_digits=n_significant_digits,
            )
            reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank])
            effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])

        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for rounded_distance in reference_neighbors_groups.keys():
            assert (
                reference_neighbors_groups[rounded_distance]
                == effective_neighbors_groups[rounded_distance]
            ), msg
# Map (Dispatcher class, dtype) to the appropriate results-comparison helper
# defined above.
ASSERT_RESULT = {
    # In the case of 64bit, we test for exact equality of the results rankings
    # and standard tolerance levels for the computed distance values.
    #
    # XXX: Note that in the future we might be interested in using quasi equality
    # checks also for float64 data (with a larger number of significant digits)
    # as the tests could be unstable because of numerically tied distances on
    # some datasets (e.g. uniform grids).
    (ArgKmin, np.float64): assert_argkmin_results_equality,
    (
        RadiusNeighbors,
        np.float64,
    ): assert_radius_neighbors_results_equality,
    # In the case of 32bit, indices can be permuted due to small difference
    # in the computations of their associated distances, hence we test equality of
    # results up to valid permutations.
    (ArgKmin, np.float32): assert_argkmin_results_quasi_equality,
    (
        RadiusNeighbors,
        np.float32,
    ): assert_radius_neighbors_results_quasi_equality,
}
def test_assert_argkmin_results_quasi_equality():
    """Check assert_argkmin_results_quasi_equality itself on crafted inputs."""
    rtol = 1e-7
    eps = 1e-7
    # Values within +/- eps of 1.0 and 6.1, i.e. within rtol of each other.
    _1m = 1.0 - eps
    _1p = 1.0 + eps
    _6_1m = 6.1 - eps
    _6_1p = 6.1 + eps

    ref_dist = np.array(
        [
            [1.2, 2.5, _6_1m, 6.1, _6_1p],
            [_1m, _1m, 1, _1p, _1p],
        ]
    )
    ref_indices = np.array(
        [
            [1, 2, 3, 4, 5],
            [6, 7, 8, 9, 10],
        ]
    )

    # Sanity check: compare the reference results to themselves.
    assert_argkmin_results_quasi_equality(
        ref_dist, ref_dist, ref_indices, ref_indices, rtol
    )

    # Apply valid permutation on indices: the last 3 points are
    # all very close to one another so we accept any permutation
    # on their rankings.
    assert_argkmin_results_quasi_equality(
        np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
        np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]),
        np.array([[1, 2, 3, 4, 5]]),
        np.array([[1, 2, 4, 5, 3]]),
        rtol=rtol,
    )

    # All points are have close distances so any ranking permutation
    # is valid for this query result.
    assert_argkmin_results_quasi_equality(
        np.array([[_1m, _1m, 1, _1p, _1p]]),
        np.array([[_1m, _1m, 1, _1p, _1p]]),
        np.array([[6, 7, 8, 9, 10]]),
        np.array([[6, 9, 7, 8, 10]]),
        rtol=rtol,
    )

    # Apply invalid permutation on indices: permuting the ranks
    # of the 2 nearest neighbors is invalid because the distance
    # values are too different.
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 3, 4, 5]]),
            rtol=rtol,
        )

    # Indices aren't properly sorted w.r.t their distances
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 4, 5, 3]]),
            rtol=rtol,
        )

    # Distances aren't properly sorted
    msg = "Distances aren't sorted on row 0"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 4, 5, 3]]),
            rtol=rtol,
        )
def test_assert_radius_neighbors_results_quasi_equality():
    """Check assert_radius_neighbors_results_quasi_equality on crafted inputs."""
    rtol = 1e-7
    eps = 1e-7
    # Values within +/- eps of 1.0 and 6.1, i.e. within rtol of each other.
    _1m = 1.0 - eps
    _1p = 1.0 + eps
    _6_1m = 6.1 - eps
    _6_1p = 6.1 + eps

    ref_dist = [
        np.array([1.2, 2.5, _6_1m, 6.1, _6_1p]),
        np.array([_1m, 1, _1p, _1p]),
    ]
    ref_indices = [
        np.array([1, 2, 3, 4, 5]),
        np.array([6, 7, 8, 9]),
    ]

    # Sanity check: compare the reference results to themselves.
    assert_radius_neighbors_results_quasi_equality(
        ref_dist,
        ref_dist,
        ref_indices,
        ref_indices,
        radius=6.1,
        rtol=rtol,
    )

    # Apply valid permutation on indices
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1, 2, 3, 4, 5])]),
        np.array([np.array([1, 2, 4, 5, 3])]),
        radius=6.1,
        rtol=rtol,
    )
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
        np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
        np.array([np.array([6, 7, 8, 9, 10])]),
        np.array([np.array([6, 9, 7, 8, 10])]),
        radius=6.1,
        rtol=rtol,
    )

    # Apply invalid permutation on indices
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 3, 4, 5])]),
            radius=6.1,
            rtol=rtol,
        )

    # Having extra last elements is valid if they are in: [radius ± rtol]
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1.2, 2.5, _6_1m, 6.1])]),
        np.array([np.array([1, 2, 3, 4, 5])]),
        np.array([np.array([1, 2, 3, 4])]),
        radius=6.1,
        rtol=rtol,
    )

    # Having extra last elements is invalid if they are lesser than radius - rtol
    msg = re.escape(
        "The last extra elements ([6.]) aren't in [radius ± rtol]=[6.1 ± 1e-07]"
    )
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, 6])]),
            np.array([np.array([1.2, 2.5])]),
            np.array([np.array([1, 2, 3])]),
            np.array([np.array([1, 2])]),
            radius=6.1,
            rtol=rtol,
        )

    # Indices aren't properly sorted w.r.t their distances
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 4, 5, 3])]),
            radius=6.1,
            rtol=rtol,
        )

    # Distances aren't properly sorted
    msg = "Distances aren't sorted on row 0"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 4, 5, 3])]),
            radius=6.1,
            rtol=rtol,
        )
def test_pairwise_distances_reduction_is_usable_for():
    """Check which dtype/format/metric combinations the dispatcher accepts."""
    rng = np.random.RandomState(0)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)
    metric = "manhattan"

    # Must be usable for all possible pair of {dense, sparse} datasets
    assert BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y_csr, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X, Y_csr, metric)

    assert BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float64), Y.astype(np.float64), metric
    )

    assert BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float32), Y.astype(np.float32), metric
    )

    # Integer datasets, mixed dtypes and python-callable metrics are rejected.
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.int64), Y.astype(np.int64), metric
    )

    assert not BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric="pyfunc")
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float32), Y, metric
    )
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X, Y.astype(np.int32), metric
    )

    # F-ordered arrays are not supported
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        np.asfortranarray(X), Y, metric
    )

    # We prefer not to use those implementations for fused sparse-dense when
    # metric="(sq)euclidean" because it's not yet the most efficient one on
    # all configurations of datasets.
    # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
    # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
    # using sparse-dense routines for matrix-vector multiplications.
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y, metric="euclidean"
    )
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y_csr, metric="sqeuclidean"
    )
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y_csr, metric="euclidean"
    )

    # CSR matrices without non-zeros elements aren't currently supported
    # TODO: support CSR matrices without non-zeros elements
    X_csr_0_nnz = csr_matrix(X * 0)
    assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric)

    # CSR matrices with int64 indices and indptr (e.g. large nnz, or large n_features)
    # aren't supported as of now.
    # See: https://github.com/scikit-learn/scikit-learn/issues/23653
    # TODO: support CSR matrices with int64 indices and indptr
    X_csr_int64 = csr_matrix(X)
    X_csr_int64.indices = X_csr_int64.indices.astype(np.int64)
    assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_int64, Y, metric)
def test_argkmin_factory_method_wrong_usages():
    """Check the error and warning paths of ArgKmin.compute."""
    rng = np.random.RandomState(1)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    k = 5
    metric = "euclidean"

    # Mixed dtype pairs must be rejected with an explicit message.
    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float32 and Y.dtype=float64"
    )
    with pytest.raises(ValueError, match=msg):
        ArgKmin.compute(X=X.astype(np.float32), Y=Y, k=k, metric=metric)

    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float64 and Y.dtype=int32"
    )
    with pytest.raises(ValueError, match=msg):
        ArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric)

    # k must be a strictly positive integer.
    with pytest.raises(ValueError, match="k == -1, must be >= 1."):
        ArgKmin.compute(X=X, Y=Y, k=-1, metric=metric)

    with pytest.raises(ValueError, match="k == 0, must be >= 1."):
        ArgKmin.compute(X=X, Y=Y, k=0, metric=metric)

    with pytest.raises(ValueError, match="Unrecognized metric"):
        ArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric")

    # Inputs must be 2D and C-contiguous.
    with pytest.raises(
        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
    ):
        ArgKmin.compute(X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric)

    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
        ArgKmin.compute(X=np.asfortranarray(X), Y=Y, k=k, metric=metric)

    # A UserWarning must be raised in this case.
    unused_metric_kwargs = {"p": 3}

    message = r"Some metric_kwargs have been passed \({'p': 3}\) but"

    with pytest.warns(UserWarning, match=message):
        ArgKmin.compute(
            X=X, Y=Y, k=k, metric=metric, metric_kwargs=unused_metric_kwargs
        )

    # A UserWarning must be raised in this case.
    metric_kwargs = {
        "p": 3,  # unused
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }

    message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"

    with pytest.warns(UserWarning, match=message):
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
def test_radius_neighbors_factory_method_wrong_usages():
    """Check the error and warning paths of RadiusNeighbors.compute."""
    rng = np.random.RandomState(1)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    radius = 5
    metric = "euclidean"

    # Mixed dtype pairs must be rejected with an explicit message.
    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float32 and Y.dtype=float64"
    )
    with pytest.raises(
        ValueError,
        match=msg,
    ):
        RadiusNeighbors.compute(
            X=X.astype(np.float32), Y=Y, radius=radius, metric=metric
        )

    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float64 and Y.dtype=int32"
    )
    with pytest.raises(
        ValueError,
        match=msg,
    ):
        RadiusNeighbors.compute(X=X, Y=Y.astype(np.int32), radius=radius, metric=metric)

    # The radius must be non-negative.
    with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."):
        RadiusNeighbors.compute(X=X, Y=Y, radius=-1, metric=metric)

    with pytest.raises(ValueError, match="Unrecognized metric"):
        RadiusNeighbors.compute(X=X, Y=Y, radius=radius, metric="wrong metric")

    # Inputs must be 2D and C-contiguous.
    with pytest.raises(
        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
    ):
        RadiusNeighbors.compute(
            X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric
        )

    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
        RadiusNeighbors.compute(
            X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric
        )

    unused_metric_kwargs = {"p": 3}

    # A UserWarning must be raised in this case.
    message = r"Some metric_kwargs have been passed \({'p': 3}\) but"

    with pytest.warns(UserWarning, match=message):
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=unused_metric_kwargs
        )

    # A UserWarning must be raised in this case.
    metric_kwargs = {
        "p": 3,  # unused
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }

    message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"

    with pytest.warns(UserWarning, match=message):
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )
@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_chunk_size_agnosticism(
    global_random_seed,
    Dispatcher,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=100,
):
    """Check that results do not depend on the chunk size."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread

    # The reduction parameter is k for ArgKmin and radius for RadiusNeighbors.
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=256,  # default
        metric="manhattan",
        return_distance=True,
        **compute_parameters,
    )

    dist, indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=41,
        metric="manhattan",
        return_distance=True,
        **compute_parameters,
    )

    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_n_threads_agnosticism(
    global_random_seed,
    Dispatcher,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=100,
):
    """Check that results do not depend on the number of threads."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread

    # The reduction parameter is k for ArgKmin and radius for RadiusNeighbors.
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=25,  # make sure we use multiple threads
        return_distance=True,
        **compute_parameters,
    )

    # Re-run single-threaded and expect identical results.
    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
        dist, indices = Dispatcher.compute(
            X,
            Y,
            parameter,
            chunk_size=25,
            return_distance=True,
            **compute_parameters,
        )

    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
@pytest.mark.parametrize(
    "Dispatcher, dtype",
    [
        (ArgKmin, np.float64),
        (RadiusNeighbors, np.float32),
        (ArgKmin, np.float32),
        (RadiusNeighbors, np.float64),
    ],
)
def test_format_agnosticism(
    global_random_seed,
    Dispatcher,
    dtype,
):
    """Check that results do not depend on the format (dense, sparse) of the input."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    n_samples, n_features = 100, 100

    X = rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples, n_features).astype(dtype) * spread

    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)

    # The reduction parameter is k for ArgKmin and radius for RadiusNeighbors.
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    dist_dense, indices_dense = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=50,
        return_distance=True,
        **compute_parameters,
    )

    # Compare the dense-dense reference against every other format pair.
    for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)):
        if _X is X and _Y is Y:
            continue
        dist, indices = Dispatcher.compute(
            _X,
            _Y,
            parameter,
            chunk_size=50,
            return_distance=True,
            **compute_parameters,
        )
        ASSERT_RESULT[(Dispatcher, dtype)](
            dist_dense,
            dist,
            indices_dense,
            indices,
            **check_parameters,
        )
@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (100, 500), (500, 100)]
)
@pytest.mark.parametrize(
    "metric",
    ["euclidean", "minkowski", "manhattan", "infinity", "seuclidean", "haversine"],
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_strategies_consistency(
    global_random_seed,
    Dispatcher,
    metric,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=10,
):
    """Check that the results do not depend on the strategy used."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread

    # Haversine distance only accepts 2D data
    if metric == "haversine":
        X = np.ascontiguousarray(X[:, :2])
        Y = np.ascontiguousarray(Y[:, :2])

    # The reduction parameter is k for ArgKmin and radius for RadiusNeighbors.
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    dist_par_X, indices_par_X = Dispatcher.compute(
        X,
        Y,
        parameter,
        metric=metric,
        # Taking the first
        metric_kwargs=_get_metric_params_list(
            metric, n_features, seed=global_random_seed
        )[0],
        # To be sure to use parallelization
        chunk_size=n_samples_X // 4,
        strategy="parallel_on_X",
        return_distance=True,
        **compute_parameters,
    )

    dist_par_Y, indices_par_Y = Dispatcher.compute(
        X,
        Y,
        parameter,
        metric=metric,
        # Taking the first
        metric_kwargs=_get_metric_params_list(
            metric, n_features, seed=global_random_seed
        )[0],
        # To be sure to use parallelization
        chunk_size=n_samples_Y // 4,
        strategy="parallel_on_Y",
        return_distance=True,
        **compute_parameters,
    )

    ASSERT_RESULT[(Dispatcher, dtype)](
        dist_par_X, dist_par_Y, indices_par_X, indices_par_Y, **check_parameters
    )
# "Concrete Dispatchers"-specific tests


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("n_features", [50, 500])
@pytest.mark.parametrize("translation", [0, 1e6])
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pairwise_distances_argkmin(
    global_random_seed,
    n_features,
    translation,
    metric,
    strategy,
    dtype,
    n_samples=100,
    k=10,
):
    """Check ArgKmin results against a scipy/numpy brute-force reference."""
    # TODO: can we easily fix this discrepancy?
    edge_cases = [
        (np.float32, "chebyshev", 1000000.0),
        (np.float32, "cityblock", 1000000.0),
    ]
    if (dtype, metric, translation) in edge_cases:
        pytest.xfail("Numerical differences lead to small differences in results.")

    rng = np.random.RandomState(global_random_seed)
    spread = 1000
    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread

    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)

    # Haversine distance only accepts 2D data
    if metric == "haversine":
        X = np.ascontiguousarray(X[:, :2])
        Y = np.ascontiguousarray(Y[:, :2])

    metric_kwargs = _get_metric_params_list(metric, n_features)[0]

    # Reference for argkmin results
    if metric == "euclidean":
        # Compare to scikit-learn GEMM optimized implementation
        dist_matrix = euclidean_distances(X, Y)
    else:
        dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
    # Taking argkmin (indices of the k smallest values)
    argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
    # Getting the associated distances
    argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64)
    for row_idx in range(argkmin_indices_ref.shape[0]):
        argkmin_distances_ref[row_idx] = dist_matrix[
            row_idx, argkmin_indices_ref[row_idx]
        ]

    # Check both the dense-dense and the sparse-sparse code paths.
    for _X, _Y in [(X, Y), (X_csr, Y_csr)]:
        argkmin_distances, argkmin_indices = ArgKmin.compute(
            _X,
            _Y,
            k,
            metric=metric,
            metric_kwargs=metric_kwargs,
            return_distance=True,
            # So as to have more than a chunk, forcing parallelism.
            chunk_size=n_samples // 4,
            strategy=strategy,
        )

        ASSERT_RESULT[(ArgKmin, dtype)](
            argkmin_distances,
            argkmin_distances_ref,
            argkmin_indices,
            argkmin_indices_ref,
        )
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("n_features", [50, 500])
@pytest.mark.parametrize("translation", [0, 1e6])
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pairwise_distances_radius_neighbors(
    global_random_seed,
    n_features,
    translation,
    metric,
    strategy,
    dtype,
    n_samples=100,
):
    """Check RadiusNeighbors results against a scipy/numpy brute-force reference."""
    rng = np.random.RandomState(global_random_seed)
    spread = 1000
    radius = spread * np.log(n_features)
    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread

    metric_kwargs = _get_metric_params_list(
        metric, n_features, seed=global_random_seed
    )[0]

    # Reference for argkmin results
    if metric == "euclidean":
        # Compare to scikit-learn GEMM optimized implementation
        dist_matrix = euclidean_distances(X, Y)
    else:
        dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)

    # Getting the neighbors for a given radius
    neigh_indices_ref = []
    neigh_distances_ref = []

    for row in dist_matrix:
        # Select neighbors within the radius and sort them by distance.
        ind = np.arange(row.shape[0])[row <= radius]
        dist = row[ind]

        sort = np.argsort(dist)
        ind, dist = ind[sort], dist[sort]

        neigh_indices_ref.append(ind)
        neigh_distances_ref.append(dist)

    neigh_distances, neigh_indices = RadiusNeighbors.compute(
        X,
        Y,
        radius,
        metric=metric,
        metric_kwargs=metric_kwargs,
        return_distance=True,
        # So as to have more than a chunk, forcing parallelism.
        chunk_size=n_samples // 4,
        strategy=strategy,
        sort_results=True,
    )

    ASSERT_RESULT[(RadiusNeighbors, dtype)](
        neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref, radius
    )
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("metric", ["manhattan", "euclidean"])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_memmap_backed_data(
    metric,
    Dispatcher,
    dtype,
):
    """Check that the results do not depend on the datasets writability."""
    rng = np.random.RandomState(0)
    spread = 100
    n_samples, n_features = 128, 10
    X = rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples, n_features).astype(dtype) * spread

    # Create read only datasets
    X_mm, Y_mm = create_memmap_backed_data([X, Y])

    # The reduction parameter is k for ArgKmin and radius for RadiusNeighbors.
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        metric=metric,
        return_distance=True,
        **compute_parameters,
    )

    dist_mm, indices_mm = Dispatcher.compute(
        X_mm,
        Y_mm,
        parameter,
        metric=metric,
        return_distance=True,
        **compute_parameters,
    )

    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist_mm, ref_indices, indices_mm, **check_parameters
    )
@pytest.mark.parametrize("n_samples", [100, 1000])
@pytest.mark.parametrize("n_features", [5, 10, 100])
@pytest.mark.parametrize("num_threads", [1, 2, 8])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_sqeuclidean_row_norms(
    global_random_seed,
    n_samples,
    n_features,
    num_threads,
    dtype,
):
    """Check sqeuclidean_row_norms against a numpy reference, dense and sparse."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples, n_features).astype(dtype) * spread

    X_csr = csr_matrix(X)

    sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2
    sq_row_norm = sqeuclidean_row_norms(X, num_threads=num_threads)

    sq_row_norm_csr = sqeuclidean_row_norms(X_csr, num_threads=num_threads)

    assert_allclose(sq_row_norm_reference, sq_row_norm)
    assert_allclose(sq_row_norm_reference, sq_row_norm_csr)

    # F-ordered input must be rejected.
    with pytest.raises(ValueError):
        X = np.asfortranarray(X)
        sqeuclidean_row_norms(X, num_threads=num_threads)