import numpy as np
import pytest

from sklearn.datasets import make_classification, make_regression
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection._mutual_info import _compute_mi
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS


def test_compute_mi_dd():
    # In the discrete case the computations are straightforward and can be
    # done by hand on the given vectors.
    x = np.array([0, 1, 1, 0, 0])
    y = np.array([1, 0, 0, 0, 1])

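    # Empirical marginals: P(x=0) = P(y=0) = 3/5 and P(x=1) = P(y=1) = 2/5.
    # The joint distribution puts mass 1/5 on (0, 0) and 2/5 on each of
    # (0, 1) and (1, 0), which gives the entropies below.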
    H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5)
    H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5)
    I_xy = H_x + H_y - H_xy

    assert_allclose(_compute_mi(x, y, x_discrete=True, y_discrete=True), I_xy)


def test_compute_mi_cc(global_dtype):
    # For two continuous variables a good approach is to test on a bivariate
    # normal distribution, where the mutual information is known.

    # Mean of the distribution, irrelevant for mutual information.
    mean = np.zeros(2)

    # Set up the covariance matrix with a correlation coefficient of 0.5.
    sigma_1 = 1
    sigma_2 = 10
    corr = 0.5
    cov = np.array(
        [
            [sigma_1**2, corr * sigma_1 * sigma_2],
            [corr * sigma_1 * sigma_2, sigma_2**2],
        ]
    )

    # True theoretical mutual information.
    I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov))
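    # Since det(cov) = sigma_1**2 * sigma_2**2 * (1 - corr**2), this expression
    # simplifies to the usual closed form -0.5 * log(1 - corr**2) for a
    # bivariate normal distribution.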

    rng = check_random_state(0)
    Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False)

    x, y = Z[:, 0], Z[:, 1]

    # The theoretical value and the estimate won't be very close, so we check
    # with a large relative tolerance.
    for n_neighbors in [3, 5, 7]:
        I_computed = _compute_mi(
            x, y, x_discrete=False, y_discrete=False, n_neighbors=n_neighbors
        )
        assert_allclose(I_computed, I_theory, rtol=1e-1)


def test_compute_mi_cd(global_dtype):
    # To test, define a joint distribution as follows:
    # p(x, y) = p(x) p(y | x)
    # X ~ Bernoulli(p)
    # (Y | x = 0) ~ Uniform(-1, 1)
    # (Y | x = 1) ~ Uniform(0, 2)

    # Use the following formula for mutual information:
    # I(X; Y) = H(Y) - H(Y | X)
    # Both entropies can be computed by hand:
    # H(Y) = -(1 - p) / 2 * ln((1 - p) / 2) - p / 2 * ln(p / 2) - 1 / 2 * ln(1 / 2)
    # H(Y | X) = ln(2)
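    # The marginal density of Y is (1 - p) / 2 on [-1, 0), 1 / 2 on [0, 1)
    # (where the two uniform components overlap) and p / 2 on [1, 2], which
    # integrates to the H(Y) expression above.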

    # Now we need to implement sampling from our distribution, which is
    # done easily using conditional distribution logic.

    n_samples = 1000
    rng = check_random_state(0)

    for p in [0.3, 0.5, 0.7]:
        x = rng.uniform(size=n_samples) > p

        y = np.empty(n_samples, global_dtype)
        mask = x == 0
        y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
        y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))

        I_theory = -0.5 * (
            (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5)
        ) - np.log(2)

        # Check with the same large relative tolerance as in test_compute_mi_cc.
        for n_neighbors in [3, 5, 7]:
            I_computed = _compute_mi(
                x, y, x_discrete=True, y_discrete=False, n_neighbors=n_neighbors
            )
            assert_allclose(I_computed, I_theory, rtol=1e-1)


def test_compute_mi_cd_unique_label(global_dtype):
    # Test that adding a sample with a unique label does not change the MI.
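    # The kNN-based estimator effectively ignores samples whose discrete label
    # occurs only once, so appending one such point should leave the estimate
    # unchanged.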

    n_samples = 100
    x = np.random.uniform(size=n_samples) > 0.5

    y = np.empty(n_samples, global_dtype)
    mask = x == 0
    y[mask] = np.random.uniform(-1, 1, size=np.sum(mask))
    y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask))

    mi_1 = _compute_mi(x, y, x_discrete=True, y_discrete=False)

    x = np.hstack((x, 2))
    y = np.hstack((y, 10))
    mi_2 = _compute_mi(x, y, x_discrete=True, y_discrete=False)

    assert_allclose(mi_1, mi_2)


# We are going to test that the feature ordering by MI matches our expectations.
def test_mutual_info_classif_discrete(global_dtype):
    X = np.array(
        [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype
    )
    y = np.array([0, 1, 2, 2, 1])

    # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly
    # informative.
    mi = mutual_info_classif(X, y, discrete_features=True)
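    # np.argsort(-mi) ranks the features from the highest to the lowest
    # estimated MI, so the expected order is [0, 2, 1].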
    assert_array_equal(np.argsort(-mi), np.array([0, 2, 1]))


def test_mutual_info_regression(global_dtype):
    # We generate a sample from a multivariate normal distribution, using a
    # transformation of initially uncorrelated variables. Variable 0 after the
    # transformation is selected as the target vector; it has the strongest
    # correlation with variable 2 and the weakest correlation with variable 1.
    T = np.array([[1, 0.5, 2, 1], [0, 1, 0.1, 0.0], [0, 0.1, 1, 0.1], [0, 0.1, 0.1, 1]])
    cov = T.dot(T.T)
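    # cov = T @ T.T gives correlations of roughly 0.85, 0.50 and 0.28 between
    # Z[:, 0] and Z[:, 2], Z[:, 3] and Z[:, 1] respectively, which determines
    # the expected MI ordering of the features checked below.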
    mean = np.zeros(4)

    rng = check_random_state(0)
    Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False)
    X = Z[:, 1:]
    y = Z[:, 0]

    mi = mutual_info_regression(X, y, random_state=0)
    assert_array_equal(np.argsort(-mi), np.array([1, 2, 0]))
    # XXX: should mutual_info_regression be fixed to avoid
    # up-casting float32 inputs to float64?
    assert mi.dtype == np.float64


def test_mutual_info_classif_mixed(global_dtype):
    # Here the target is discrete and there are two continuous and one
    # discrete feature. y thresholds a combination of the continuous columns 0
    # and 2 before column 2 is binarized, so column 2 is expected to be the
    # most informative feature, followed by column 0; column 1 is informative
    # only through the copy of column 0 added to it.
    rng = check_random_state(0)
    X = rng.rand(1000, 3).astype(global_dtype, copy=False)
    X[:, 1] += X[:, 0]
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(
        X, y, discrete_features=[2], n_neighbors=3, random_state=0
    )
    assert_array_equal(np.argsort(-mi), [2, 0, 1])
    for n_neighbors in [5, 7, 9]:
        mi_nn = mutual_info_classif(
            X, y, discrete_features=[2], n_neighbors=n_neighbors, random_state=0
        )
        # Check that the continuous features have a higher MI with greater
        # n_neighbors.
        assert mi_nn[0] > mi[0]
        assert mi_nn[1] > mi[1]
        # n_neighbors should not have any effect on the discrete feature:
        # the MI should stay the same.
        assert mi_nn[2] == mi[2]


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
|
def test_mutual_info_options(global_dtype, csr_container):
|
|
X = np.array(
|
|
[[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype
|
|
)
|
|
y = np.array([0, 1, 2, 2, 1], dtype=global_dtype)
|
|
X_csr = csr_container(X)
|
|
|
|
for mutual_info in (mutual_info_regression, mutual_info_classif):
|
|
with pytest.raises(ValueError):
|
|
mutual_info(X_csr, y, discrete_features=False)
|
|
with pytest.raises(ValueError):
|
|
mutual_info(X, y, discrete_features="manual")
|
|
with pytest.raises(ValueError):
|
|
mutual_info(X_csr, y, discrete_features=[True, False, True])
|
|
with pytest.raises(IndexError):
|
|
mutual_info(X, y, discrete_features=[True, False, True, False])
|
|
with pytest.raises(IndexError):
|
|
mutual_info(X, y, discrete_features=[1, 4])
|
|
|
|
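        # With discrete_features="auto", dense X is treated as fully
        # continuous and sparse X as fully discrete, so mi_1/mi_2 and
        # mi_3/mi_4 should each agree while mi_1 and mi_3 should differ.
        # A boolean mask and the corresponding index array are equivalent
        # (mi_5 and mi_6).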
        mi_1 = mutual_info(X, y, discrete_features="auto", random_state=0)
        mi_2 = mutual_info(X, y, discrete_features=False, random_state=0)
        mi_3 = mutual_info(X_csr, y, discrete_features="auto", random_state=0)
        mi_4 = mutual_info(X_csr, y, discrete_features=True, random_state=0)
        mi_5 = mutual_info(X, y, discrete_features=[True, False, True], random_state=0)
        mi_6 = mutual_info(X, y, discrete_features=[0, 2], random_state=0)

        assert_allclose(mi_1, mi_2)
        assert_allclose(mi_3, mi_4)
        assert_allclose(mi_5, mi_6)

        assert not np.allclose(mi_1, mi_3)


@pytest.mark.parametrize("correlated", [True, False])
|
|
def test_mutual_information_symmetry_classif_regression(correlated, global_random_seed):
|
|
"""Check that `mutual_info_classif` and `mutual_info_regression` are
|
|
symmetric by switching the target `y` as `feature` in `X` and vice
|
|
versa.
|
|
|
|
Non-regression test for:
|
|
https://github.com/scikit-learn/scikit-learn/issues/23720
|
|
"""
|
|
rng = np.random.RandomState(global_random_seed)
|
|
n = 100
|
|
d = rng.randint(10, size=n)
|
|
|
|
if correlated:
|
|
c = d.astype(np.float64)
|
|
else:
|
|
c = rng.normal(0, 1, size=n)
|
|
|
|
mi_classif = mutual_info_classif(
|
|
c[:, None], d, discrete_features=[False], random_state=global_random_seed
|
|
)
|
|
|
|
mi_regression = mutual_info_regression(
|
|
d[:, None], c, discrete_features=[True], random_state=global_random_seed
|
|
)
|
|
|
|
assert mi_classif == pytest.approx(mi_regression)
|
|
|
|
|
|
def test_mutual_info_regression_X_int_dtype(global_random_seed):
    """Check that the results agree whether X has an integer or a float dtype.

    Non-regression test for Issue #26696.
    """
    rng = np.random.RandomState(global_random_seed)
    X = rng.randint(100, size=(100, 10))
    X_float = X.astype(np.float64, copy=True)
    y = rng.randint(100, size=100)

    expected = mutual_info_regression(X_float, y, random_state=global_random_seed)
    result = mutual_info_regression(X, y, random_state=global_random_seed)
    assert_allclose(result, expected)


@pytest.mark.parametrize(
    "mutual_info_func, data_generator",
    [
        (mutual_info_regression, make_regression),
        (mutual_info_classif, make_classification),
    ],
)
def test_mutual_info_n_jobs(global_random_seed, mutual_info_func, data_generator):
    """Check that results are consistent with different `n_jobs`."""
    X, y = data_generator(random_state=global_random_seed)
    single_job = mutual_info_func(X, y, random_state=global_random_seed, n_jobs=1)
    multi_job = mutual_info_func(X, y, random_state=global_random_seed, n_jobs=2)
    assert_allclose(single_job, multi_job)