Inzynierka/Lib/site-packages/sklearn/metrics/tests/test_dist_metrics.py

457 lines
16 KiB
Python
Raw Normal View History

2023-06-02 12:51:02 +02:00
import itertools
import pickle
import copy
import numpy as np
import pytest
import scipy.sparse as sp
from scipy.spatial.distance import cdist
from sklearn.metrics import DistanceMetric
from sklearn.metrics._dist_metrics import (
BOOL_METRICS,
# Unexposed private DistanceMetric for 32 bit
DistanceMetric32,
)
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_allclose, create_memmap_backed_data
from sklearn.utils.fixes import sp_version, parse_version
def dist_func(x1, x2, p):
return np.sum((x1 - x2) ** p) ** (1.0 / p)
rng = check_random_state(0)
d = 4
n1 = 20
n2 = 25
X64 = rng.random_sample((n1, d))
Y64 = rng.random_sample((n2, d))
X32 = X64.astype("float32")
Y32 = Y64.astype("float32")
[X_mmap, Y_mmap] = create_memmap_backed_data([X64, Y64])
# make boolean arrays: ones and zeros
X_bool = (X64 < 0.3).astype(np.float64) # quite sparse
Y_bool = (Y64 < 0.7).astype(np.float64) # not too sparse
[X_bool_mmap, Y_bool_mmap] = create_memmap_backed_data([X_bool, Y_bool])
V = rng.random_sample((d, d))
VI = np.dot(V, V.T)
METRICS_DEFAULT_PARAMS = [
("euclidean", {}),
("cityblock", {}),
("minkowski", dict(p=(1, 1.5, 2, 3))),
("chebyshev", {}),
("seuclidean", dict(V=(rng.random_sample(d),))),
("mahalanobis", dict(VI=(VI,))),
("hamming", {}),
("canberra", {}),
("braycurtis", {}),
]
if sp_version >= parse_version("1.8.0.dev0"):
# Starting from scipy 1.8.0.dev0, minkowski now accepts w, the weighting
# parameter directly and using it is preferred over using wminkowski.
METRICS_DEFAULT_PARAMS.append(
("minkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))),
)
else:
# For previous versions of scipy, this was possible through a dedicated
# metric (deprecated in 1.6 and removed in 1.8).
METRICS_DEFAULT_PARAMS.append(
("wminkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))),
)
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize(
"metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0]
)
@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)])
def test_cdist(metric_param_grid, X, Y):
DistanceMetricInterface = (
DistanceMetric if X.dtype == Y.dtype == np.float64 else DistanceMetric32
)
metric, param_grid = metric_param_grid
keys = param_grid.keys()
X_csr, Y_csr = sp.csr_matrix(X), sp.csr_matrix(Y)
for vals in itertools.product(*param_grid.values()):
kwargs = dict(zip(keys, vals))
rtol_dict = {}
if metric == "mahalanobis" and X.dtype == np.float32:
# Computation of mahalanobis differs between
# the scipy and scikit-learn implementation.
# Hence, we increase the relative tolerance.
# TODO: Inspect slight numerical discrepancy
# with scipy
rtol_dict = {"rtol": 1e-6}
if metric == "wminkowski":
# wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0
WarningToExpect = None
if sp_version >= parse_version("1.6.0"):
WarningToExpect = DeprecationWarning
with pytest.warns(WarningToExpect):
D_scipy_cdist = cdist(X, Y, metric, **kwargs)
else:
D_scipy_cdist = cdist(X, Y, metric, **kwargs)
dm = DistanceMetricInterface.get_metric(metric, **kwargs)
# DistanceMetric.pairwise must be consistent for all
# combinations of formats in {sparse, dense}.
D_sklearn = dm.pairwise(X, Y)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict)
D_sklearn = dm.pairwise(X_csr, Y_csr)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict)
D_sklearn = dm.pairwise(X_csr, Y)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict)
D_sklearn = dm.pairwise(X, Y_csr)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict)
@pytest.mark.parametrize("metric", BOOL_METRICS)
@pytest.mark.parametrize(
"X_bool, Y_bool", [(X_bool, Y_bool), (X_bool_mmap, Y_bool_mmap)]
)
def test_cdist_bool_metric(metric, X_bool, Y_bool):
D_scipy_cdist = cdist(X_bool, Y_bool, metric)
dm = DistanceMetric.get_metric(metric)
D_sklearn = dm.pairwise(X_bool, Y_bool)
assert_allclose(D_sklearn, D_scipy_cdist)
# DistanceMetric.pairwise must be consistent
# on all combinations of format in {sparse, dense}².
X_bool_csr, Y_bool_csr = sp.csr_matrix(X_bool), sp.csr_matrix(Y_bool)
D_sklearn = dm.pairwise(X_bool, Y_bool)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_scipy_cdist)
D_sklearn = dm.pairwise(X_bool_csr, Y_bool_csr)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_scipy_cdist)
D_sklearn = dm.pairwise(X_bool, Y_bool_csr)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_scipy_cdist)
D_sklearn = dm.pairwise(X_bool_csr, Y_bool)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_scipy_cdist)
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize(
"metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0]
)
@pytest.mark.parametrize("X", [X64, X32, X_mmap])
def test_pdist(metric_param_grid, X):
DistanceMetricInterface = (
DistanceMetric if X.dtype == np.float64 else DistanceMetric32
)
metric, param_grid = metric_param_grid
keys = param_grid.keys()
X_csr = sp.csr_matrix(X)
for vals in itertools.product(*param_grid.values()):
kwargs = dict(zip(keys, vals))
rtol_dict = {}
if metric == "mahalanobis" and X.dtype == np.float32:
# Computation of mahalanobis differs between
# the scipy and scikit-learn implementation.
# Hence, we increase the relative tolerance.
# TODO: Inspect slight numerical discrepancy
# with scipy
rtol_dict = {"rtol": 1e-6}
if metric == "wminkowski":
if sp_version >= parse_version("1.8.0"):
pytest.skip("wminkowski will be removed in SciPy 1.8.0")
# wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0
ExceptionToAssert = None
if sp_version >= parse_version("1.6.0"):
ExceptionToAssert = DeprecationWarning
with pytest.warns(ExceptionToAssert):
D_scipy_pdist = cdist(X, X, metric, **kwargs)
else:
D_scipy_pdist = cdist(X, X, metric, **kwargs)
dm = DistanceMetricInterface.get_metric(metric, **kwargs)
D_sklearn = dm.pairwise(X)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_scipy_pdist, **rtol_dict)
D_sklearn_csr = dm.pairwise(X_csr)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict)
D_sklearn_csr = dm.pairwise(X_csr, X_csr)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict)
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize(
"metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0]
)
def test_distance_metrics_dtype_consistency(metric_param_grid):
# DistanceMetric must return similar distances for both float32 and float64
# input data.
metric, param_grid = metric_param_grid
keys = param_grid.keys()
# Choose rtol to make sure that this test is robust to changes in the random
# seed in the module-level test data generation code.
rtol = 1e-5
for vals in itertools.product(*param_grid.values()):
kwargs = dict(zip(keys, vals))
dm64 = DistanceMetric.get_metric(metric, **kwargs)
dm32 = DistanceMetric32.get_metric(metric, **kwargs)
D64 = dm64.pairwise(X64)
D32 = dm32.pairwise(X32)
# Both results are np.float64 dtype because the accumulation accross
# features is done in float64. However the input data and the element
# wise arithmetic operations are done in float32 so we can expect a
# small discrepancy.
assert D64.dtype == D32.dtype == np.float64
# assert_allclose introspects the dtype of the input arrays to decide
# which rtol value to use by default but in this case we know that D32
# is not computed with the same precision so we set rtol manually.
assert_allclose(D64, D32, rtol=rtol)
D64 = dm64.pairwise(X64, Y64)
D32 = dm32.pairwise(X32, Y32)
assert_allclose(D64, D32, rtol=rtol)
@pytest.mark.parametrize("metric", BOOL_METRICS)
@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap])
def test_pdist_bool_metrics(metric, X_bool):
D_scipy_pdist = cdist(X_bool, X_bool, metric)
dm = DistanceMetric.get_metric(metric)
D_sklearn = dm.pairwise(X_bool)
assert_allclose(D_sklearn, D_scipy_pdist)
X_bool_csr = sp.csr_matrix(X_bool)
D_sklearn = dm.pairwise(X_bool_csr)
assert_allclose(D_sklearn, D_scipy_pdist)
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("writable_kwargs", [True, False])
@pytest.mark.parametrize(
"metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0]
)
@pytest.mark.parametrize("X", [X64, X32])
def test_pickle(writable_kwargs, metric_param_grid, X):
DistanceMetricInterface = (
DistanceMetric if X.dtype == np.float64 else DistanceMetric32
)
metric, param_grid = metric_param_grid
keys = param_grid.keys()
for vals in itertools.product(*param_grid.values()):
if any(isinstance(val, np.ndarray) for val in vals):
vals = copy.deepcopy(vals)
for val in vals:
if isinstance(val, np.ndarray):
val.setflags(write=writable_kwargs)
kwargs = dict(zip(keys, vals))
dm = DistanceMetricInterface.get_metric(metric, **kwargs)
D1 = dm.pairwise(X)
dm2 = pickle.loads(pickle.dumps(dm))
D2 = dm2.pairwise(X)
assert_allclose(D1, D2)
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("metric", BOOL_METRICS)
@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap])
def test_pickle_bool_metrics(metric, X_bool):
dm = DistanceMetric.get_metric(metric)
D1 = dm.pairwise(X_bool)
dm2 = pickle.loads(pickle.dumps(dm))
D2 = dm2.pairwise(X_bool)
assert_allclose(D1, D2)
@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)])
def test_haversine_metric(X, Y):
DistanceMetricInterface = (
DistanceMetric if X.dtype == np.float64 else DistanceMetric32
)
# The Haversine DistanceMetric only works on 2 features.
X = np.asarray(X[:, :2])
Y = np.asarray(Y[:, :2])
X_csr, Y_csr = sp.csr_matrix(X), sp.csr_matrix(Y)
# Haversine is not supported by scipy.special.distance.{cdist,pdist}
# So we reimplement it to have a reference.
def haversine_slow(x1, x2):
return 2 * np.arcsin(
np.sqrt(
np.sin(0.5 * (x1[0] - x2[0])) ** 2
+ np.cos(x1[0]) * np.cos(x2[0]) * np.sin(0.5 * (x1[1] - x2[1])) ** 2
)
)
D_reference = np.zeros((X_csr.shape[0], Y_csr.shape[0]))
for i, xi in enumerate(X):
for j, yj in enumerate(Y):
D_reference[i, j] = haversine_slow(xi, yj)
haversine = DistanceMetricInterface.get_metric("haversine")
D_sklearn = haversine.pairwise(X, Y)
assert_allclose(
haversine.dist_to_rdist(D_sklearn), np.sin(0.5 * D_reference) ** 2, rtol=1e-6
)
assert_allclose(D_sklearn, D_reference)
D_sklearn = haversine.pairwise(X_csr, Y_csr)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_reference)
D_sklearn = haversine.pairwise(X_csr, Y)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_reference)
D_sklearn = haversine.pairwise(X, Y_csr)
assert D_sklearn.flags.c_contiguous
assert_allclose(D_sklearn, D_reference)
def test_pyfunc_metric():
X = np.random.random((10, 3))
euclidean = DistanceMetric.get_metric("euclidean")
pyfunc = DistanceMetric.get_metric("pyfunc", func=dist_func, p=2)
# Check if both callable metric and predefined metric initialized
# DistanceMetric object is picklable
euclidean_pkl = pickle.loads(pickle.dumps(euclidean))
pyfunc_pkl = pickle.loads(pickle.dumps(pyfunc))
D1 = euclidean.pairwise(X)
D2 = pyfunc.pairwise(X)
D1_pkl = euclidean_pkl.pairwise(X)
D2_pkl = pyfunc_pkl.pairwise(X)
assert_allclose(D1, D2)
assert_allclose(D1_pkl, D2_pkl)
def test_input_data_size():
# Regression test for #6288
# Previously, a metric requiring a particular input dimension would fail
def custom_metric(x, y):
assert x.shape[0] == 3
return np.sum((x - y) ** 2)
rng = check_random_state(0)
X = rng.rand(10, 3)
pyfunc = DistanceMetric.get_metric("pyfunc", func=custom_metric)
eucl = DistanceMetric.get_metric("euclidean")
assert_allclose(pyfunc.pairwise(X), eucl.pairwise(X) ** 2)
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
def test_readonly_kwargs():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/21685
rng = check_random_state(0)
weights = rng.rand(100)
VI = rng.rand(10, 10)
weights.setflags(write=False)
VI.setflags(write=False)
# Those distances metrics have to support readonly buffers.
DistanceMetric.get_metric("seuclidean", V=weights)
DistanceMetric.get_metric("wminkowski", p=1, w=weights)
DistanceMetric.get_metric("mahalanobis", VI=VI)
@pytest.mark.parametrize(
"w, err_type, err_msg",
[
(np.array([1, 1.5, -13]), ValueError, "w cannot contain negative weights"),
(np.array([1, 1.5, np.nan]), ValueError, "w contains NaN"),
(
sp.csr_matrix([1, 1.5, 1]),
TypeError,
"A sparse matrix was passed, but dense data is required",
),
(np.array(["a", "b", "c"]), ValueError, "could not convert string to float"),
(np.array([]), ValueError, "a minimum of 1 is required"),
],
)
def test_minkowski_metric_validate_weights_values(w, err_type, err_msg):
with pytest.raises(err_type, match=err_msg):
DistanceMetric.get_metric("minkowski", p=3, w=w)
def test_minkowski_metric_validate_weights_size():
w2 = rng.random_sample(d + 1)
dm = DistanceMetric.get_metric("minkowski", p=3, w=w2)
msg = (
"MinkowskiDistance: the size of w must match "
f"the number of features \\({X64.shape[1]}\\). "
f"Currently len\\(w\\)={w2.shape[0]}."
)
with pytest.raises(ValueError, match=msg):
dm.pairwise(X64, Y64)
# TODO: Remove in 1.3 when wminkowski is removed
def test_wminkowski_deprecated():
w = rng.random_sample(d)
msg = "WMinkowskiDistance is deprecated in version 1.1"
with pytest.warns(FutureWarning, match=msg):
DistanceMetric.get_metric("wminkowski", p=3, w=w)
# TODO: Remove in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("p", [1, 1.5, 3])
def test_wminkowski_minkowski_equivalence(p):
w = rng.random_sample(d)
# Weights are rescaled for consistency w.r.t scipy 1.8 refactoring of 'minkowski'
dm_wmks = DistanceMetric.get_metric("wminkowski", p=p, w=(w) ** (1 / p))
dm_mks = DistanceMetric.get_metric("minkowski", p=p, w=w)
D_wmks = dm_wmks.pairwise(X64, Y64)
D_mks = dm_mks.pairwise(X64, Y64)
assert_allclose(D_wmks, D_mks)