3RNN/Lib/site-packages/sklearn/feature_selection/tests/test_mutual_info.py

import numpy as np
import pytest

from sklearn.datasets import make_classification, make_regression
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection._mutual_info import _compute_mi
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS


def test_compute_mi_dd():
    """Check discrete/discrete MI against a value worked out by hand."""
    x = np.array([0, 1, 1, 0, 0])
    y = np.array([1, 0, 0, 0, 1])

    # Each vector holds one value three times and the other twice, so both
    # marginal entropies are the same number.
    entropy_x = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5)
    entropy_y = entropy_x
    # Three distinct (x, y) pairs occur, with frequencies 1/5, 2/5 and 2/5.
    joint_entropy = (
        -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5)
    )
    expected_mi = entropy_x + entropy_y - joint_entropy

    computed_mi = _compute_mi(x, y, x_discrete=True, y_discrete=True)
    assert_allclose(computed_mi, expected_mi)


def test_compute_mi_cc(global_dtype):
    """Compare continuous/continuous MI with the bivariate normal closed form."""
    # Standard deviations and correlation coefficient (0.5) that define the
    # covariance matrix of the two variables.
    std_x, std_y, rho = 1, 10, 0.5
    cross_cov = rho * std_x * std_y
    cov = np.array([[std_x**2, cross_cov], [cross_cov, std_y**2]])

    # Theoretical mutual information of the bivariate normal distribution.
    expected_mi = np.log(std_x) + np.log(std_y) - 0.5 * np.log(np.linalg.det(cov))

    rng = check_random_state(0)
    # A zero mean is used; the mean is irrelevant for mutual information.
    samples = rng.multivariate_normal(np.zeros(2), cov, size=1000)
    samples = samples.astype(global_dtype, copy=False)
    x, y = samples[:, 0], samples[:, 1]

    # The estimate is not expected to land very close to the theoretical
    # value, so a large relative tolerance is used.
    for n_neighbors in (3, 5, 7):
        estimated_mi = _compute_mi(
            x, y, x_discrete=False, y_discrete=False, n_neighbors=n_neighbors
        )
        assert_allclose(estimated_mi, expected_mi, rtol=1e-1)


def test_compute_mi_cd(global_dtype):
    """Compare discrete/continuous MI with a hand-derived value.

    The joint distribution is defined as p(x, y) = p(x) p(y | x) with
    X ~ Bernoulli(p), (Y | x = 0) ~ Uniform(-1, 1) and
    (Y | x = 1) ~ Uniform(0, 2).

    With I(X; Y) = H(Y) - H(Y | X), both entropies can be computed by hand:
    H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2 * ln(p/2) - 1/2 * ln(1/2)
    H(Y | X) = ln(2)
    """
    n_samples = 1000
    rng = check_random_state(0)

    for p in (0.3, 0.5, 0.7):
        # Draw the discrete variable first, then fill the continuous one from
        # the conditional distribution matching each discrete value.
        x = rng.uniform(size=n_samples) > p
        is_zero = x == 0
        is_one = ~is_zero

        y = np.empty(n_samples, global_dtype)
        y[is_zero] = rng.uniform(-1, 1, size=np.sum(is_zero))
        y[is_one] = rng.uniform(0, 2, size=np.sum(is_one))

        entropy_y = -0.5 * (
            (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5)
        )
        expected_mi = entropy_y - np.log(2)

        # The same tolerance is applied for every neighborhood size.
        for n_neighbors in (3, 5, 7):
            estimated_mi = _compute_mi(
                x, y, x_discrete=True, y_discrete=False, n_neighbors=n_neighbors
            )
            assert_allclose(estimated_mi, expected_mi, rtol=1e-1)


def test_compute_mi_cd_unique_label(global_dtype):
    """Check that appending a sample with a unique label leaves MI unchanged."""
    n_samples = 100
    # Use a seeded local generator instead of the global NumPy random state so
    # that the generated data are reproducible from run to run, as in the
    # other tests of this module.
    rng = check_random_state(0)
    x = rng.uniform(size=n_samples) > 0.5

    y = np.empty(n_samples, global_dtype)
    mask = x == 0
    y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
    y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))

    mi_1 = _compute_mi(x, y, x_discrete=True, y_discrete=False)

    # Append a single extra sample whose discrete label (2) occurs nowhere
    # else in `x`.
    x = np.hstack((x, 2))
    y = np.hstack((y, 10))
    mi_2 = _compute_mi(x, y, x_discrete=True, y_discrete=False)

    assert_allclose(mi_1, mi_2)


# We are going to test that feature ordering by MI matches our expectations.
def test_mutual_info_classif_discrete(global_dtype):
    """Check the MI-based ranking of three discrete features."""
    rows = [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]]
    X = np.array(rows, dtype=global_dtype)
    y = np.array([0, 1, 2, 2, 1])

    mi = mutual_info_classif(X, y, discrete_features=True)

    # Column 0 is expected to rank first (most informative) and column 1
    # last (only weakly informative).
    ranking = np.argsort(-mi)
    assert_array_equal(ranking, np.array([0, 2, 1]))


def test_mutual_info_regression(global_dtype):
    """Check the MI-based ranking of correlated Gaussian features.

    Samples come from a multivariate normal distribution whose covariance is
    built by linearly mixing initially uncorrelated variables. Variable 0 of
    the result is the target; it has the strongest correlation with variable
    2 and the weakest with variable 1.
    """
    mixing = np.array(
        [[1, 0.5, 2, 1], [0, 1, 0.1, 0.0], [0, 0.1, 1, 0.1], [0, 0.1, 0.1, 1]]
    )
    cov = np.dot(mixing, mixing.T)

    rng = check_random_state(0)
    Z = rng.multivariate_normal(np.zeros(4), cov, size=1000)
    Z = Z.astype(global_dtype, copy=False)
    y, X = Z[:, 0], Z[:, 1:]

    mi = mutual_info_regression(X, y, random_state=0)

    # Feature indices are shifted by one relative to the variables of Z.
    ranking = np.argsort(-mi)
    assert_array_equal(ranking, np.array([1, 2, 0]))
    # XXX: should mutual_info_regression be fixed to avoid
    # up-casting float32 inputs to float64?
    assert mi.dtype == np.float64


def test_mutual_info_classif_mixed(global_dtype):
    """Check MI with a discrete target and mixed continuous/discrete features."""
    rng = check_random_state(0)
    X = rng.rand(1000, 3).astype(global_dtype, copy=False)
    # The statements below must run in this order: the target is derived from
    # the continuous column 2 before that column is binarized in place.
    X[:, 1] += X[:, 0]
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    def estimate(n_neighbors):
        # Column 2 is the only feature declared as discrete.
        return mutual_info_classif(
            X, y, discrete_features=[2], n_neighbors=n_neighbors, random_state=0
        )

    baseline = estimate(3)
    assert_array_equal(np.argsort(-baseline), [2, 0, 1])

    for n_neighbors in (5, 7, 9):
        current = estimate(n_neighbors)
        # Both continuous features must score higher than with 3 neighbors.
        assert current[0] > baseline[0]
        assert current[1] > baseline[1]
        # The number of neighbors must not affect the discrete feature: its
        # MI has to be exactly the same.
        assert current[2] == baseline[2]


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_mutual_info_options(global_dtype, csr_container):
    """Check validation and equivalence of `discrete_features` settings."""
    rows = [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]]
    X = np.array(rows, dtype=global_dtype)
    y = np.array([0, 1, 2, 2, 1], dtype=global_dtype)
    X_csr = csr_container(X)

    # (expected exception, input data, discrete_features) for each call that
    # must be rejected.
    rejected_calls = [
        (ValueError, X_csr, False),
        (ValueError, X, "manual"),
        (ValueError, X_csr, [True, False, True]),
        (IndexError, X, [True, False, True, False]),
        (IndexError, X, [1, 4]),
    ]

    for mutual_info in (mutual_info_regression, mutual_info_classif):
        for expected_error, data, discrete_features in rejected_calls:
            with pytest.raises(expected_error):
                mutual_info(data, y, discrete_features=discrete_features)

        dense_auto = mutual_info(X, y, discrete_features="auto", random_state=0)
        dense_continuous = mutual_info(X, y, discrete_features=False, random_state=0)
        sparse_auto = mutual_info(X_csr, y, discrete_features="auto", random_state=0)
        sparse_discrete = mutual_info(
            X_csr, y, discrete_features=True, random_state=0
        )
        by_mask = mutual_info(
            X, y, discrete_features=[True, False, True], random_state=0
        )
        by_index = mutual_info(X, y, discrete_features=[0, 2], random_state=0)

        # Each pair of settings below must produce the same estimates.
        assert_allclose(dense_auto, dense_continuous)
        assert_allclose(sparse_auto, sparse_discrete)
        assert_allclose(by_mask, by_index)

        # "auto" must give different estimates for dense and sparse input.
        assert not np.allclose(dense_auto, sparse_auto)


@pytest.mark.parametrize("correlated", [True, False])
def test_mutual_information_symmetry_classif_regression(correlated, global_random_seed):
    """Check that swapping feature and target gives the same MI.

    `mutual_info_classif` is called with a continuous feature and a discrete
    target, and `mutual_info_regression` with the same two variables in the
    opposite roles; both estimates must agree.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/23720
    """
    rng = np.random.RandomState(global_random_seed)
    n = 100
    discrete = rng.randint(10, size=n)

    # The continuous variable is either a float copy of the discrete one
    # (correlated case) or independent Gaussian noise.
    continuous = (
        discrete.astype(np.float64) if correlated else rng.normal(0, 1, size=n)
    )

    mi_classif = mutual_info_classif(
        continuous[:, None],
        discrete,
        discrete_features=[False],
        random_state=global_random_seed,
    )
    mi_regression = mutual_info_regression(
        discrete[:, None],
        continuous,
        discrete_features=[True],
        random_state=global_random_seed,
    )

    assert mi_classif == pytest.approx(mi_regression)


def test_mutual_info_regression_X_int_dtype(global_random_seed):
    """Check that integer and float versions of `X` give the same result.

    Non-regression test for Issue #26696.
    """
    rng = np.random.RandomState(global_random_seed)
    X_int = rng.randint(100, size=(100, 10))
    y = rng.randint(100, size=100)
    # Same values as `X_int`, stored in a separate float64 array.
    X_float = X_int.astype(np.float64, copy=True)

    mi_from_float = mutual_info_regression(
        X_float, y, random_state=global_random_seed
    )
    mi_from_int = mutual_info_regression(X_int, y, random_state=global_random_seed)
    assert_allclose(mi_from_int, mi_from_float)


@pytest.mark.parametrize(
    "mutual_info_func, data_generator",
    [
        (mutual_info_regression, make_regression),
        (mutual_info_classif, make_classification),
    ],
)
def test_mutual_info_n_jobs(global_random_seed, mutual_info_func, data_generator):
    """Check that the estimates do not depend on the value of `n_jobs`."""
    X, y = data_generator(random_state=global_random_seed)

    # Evaluate with one job first, then with two, on identical inputs and seed.
    sequential, parallel = [
        mutual_info_func(X, y, random_state=global_random_seed, n_jobs=n_jobs)
        for n_jobs in (1, 2)
    ]
    assert_allclose(sequential, parallel)
1.0 2024-05-26 19:49:15 +02:00			`import numpy as np`
			`import pytest`

			`from sklearn.datasets import make_classification, make_regression`
			`from sklearn.feature_selection import mutual_info_classif, mutual_info_regression`
			`from sklearn.feature_selection._mutual_info import _compute_mi`
			`from sklearn.utils import check_random_state`
			`from sklearn.utils._testing import (`
			`assert_allclose,`
			`assert_array_equal,`
			`)`
			`from sklearn.utils.fixes import CSR_CONTAINERS`


			`def test_compute_mi_dd():`
			`# In discrete case computations are straightforward and can be done`
			`# by hand on given vectors.`
			`x = np.array([0, 1, 1, 0, 0])`
			`y = np.array([1, 0, 0, 0, 1])`

			`H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5)`
			`H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5)`
			`I_xy = H_x + H_y - H_xy`

			`assert_allclose(_compute_mi(x, y, x_discrete=True, y_discrete=True), I_xy)`


			`def test_compute_mi_cc(global_dtype):`
			`# For two continuous variables a good approach is to test on bivariate`
			`# normal distribution, where mutual information is known.`

			`# Mean of the distribution, irrelevant for mutual information.`
			`mean = np.zeros(2)`

			`# Setup covariance matrix with correlation coeff. equal 0.5.`
			`sigma_1 = 1`
			`sigma_2 = 10`
			`corr = 0.5`
			`cov = np.array(`
			`[`
			`[sigma_1**2, corr * sigma_1 * sigma_2],`
			`[corr * sigma_1 * sigma_2, sigma_2**2],`
			`]`
			`)`

			`# True theoretical mutual information.`
			`I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov))`

			`rng = check_random_state(0)`
			`Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False)`

			`x, y = Z[:, 0], Z[:, 1]`

			`# Theory and computed values won't be very close`
			`# We here check with a large relative tolerance`
			`for n_neighbors in [3, 5, 7]:`
			`I_computed = _compute_mi(`
			`x, y, x_discrete=False, y_discrete=False, n_neighbors=n_neighbors`
			`)`
			`assert_allclose(I_computed, I_theory, rtol=1e-1)`


			`def test_compute_mi_cd(global_dtype):`
			`# To test define a joint distribution as follows:`
			`# p(x, y) = p(x) p(y \| x)`
			`# X ~ Bernoulli(p)`
			`# (Y \| x = 0) ~ Uniform(-1, 1)`
			`# (Y \| x = 1) ~ Uniform(0, 2)`

			`# Use the following formula for mutual information:`
			`# I(X; Y) = H(Y) - H(Y \| X)`
			`# Two entropies can be computed by hand:`
			`# H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2)`
			`# H(Y \| X) = ln(2)`

			`# Now we need to implement sampling from out distribution, which is`
			`# done easily using conditional distribution logic.`

			`n_samples = 1000`
			`rng = check_random_state(0)`

			`for p in [0.3, 0.5, 0.7]:`
			`x = rng.uniform(size=n_samples) > p`

			`y = np.empty(n_samples, global_dtype)`
			`mask = x == 0`
			`y[mask] = rng.uniform(-1, 1, size=np.sum(mask))`
			`y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))`

			`I_theory = -0.5 * (`
			`(1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5)`
			`) - np.log(2)`

			`# Assert the same tolerance.`
			`for n_neighbors in [3, 5, 7]:`
			`I_computed = _compute_mi(`
			`x, y, x_discrete=True, y_discrete=False, n_neighbors=n_neighbors`
			`)`
			`assert_allclose(I_computed, I_theory, rtol=1e-1)`


			`def test_compute_mi_cd_unique_label(global_dtype):`
			`# Test that adding unique label doesn't change MI.`
			`n_samples = 100`
			`x = np.random.uniform(size=n_samples) > 0.5`

			`y = np.empty(n_samples, global_dtype)`
			`mask = x == 0`
			`y[mask] = np.random.uniform(-1, 1, size=np.sum(mask))`
			`y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask))`

			`mi_1 = _compute_mi(x, y, x_discrete=True, y_discrete=False)`

			`x = np.hstack((x, 2))`
			`y = np.hstack((y, 10))`
			`mi_2 = _compute_mi(x, y, x_discrete=True, y_discrete=False)`

			`assert_allclose(mi_1, mi_2)`


			`# We are going test that feature ordering by MI matches our expectations.`
			`def test_mutual_info_classif_discrete(global_dtype):`
			`X = np.array(`
			`[[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype`
			`)`
			`y = np.array([0, 1, 2, 2, 1])`

			`# Here X[:, 0] is the most informative feature, and X[:, 1] is weakly`
			`# informative.`
			`mi = mutual_info_classif(X, y, discrete_features=True)`
			`assert_array_equal(np.argsort(-mi), np.array([0, 2, 1]))`


			`def test_mutual_info_regression(global_dtype):`
			`# We generate sample from multivariate normal distribution, using`
			`# transformation from initially uncorrelated variables. The zero`
			`# variables after transformation is selected as the target vector,`
			`# it has the strongest correlation with the variable 2, and`
			`# the weakest correlation with the variable 1.`
			`T = np.array([[1, 0.5, 2, 1], [0, 1, 0.1, 0.0], [0, 0.1, 1, 0.1], [0, 0.1, 0.1, 1]])`
			`cov = T.dot(T.T)`
			`mean = np.zeros(4)`

			`rng = check_random_state(0)`
			`Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False)`
			`X = Z[:, 1:]`
			`y = Z[:, 0]`

			`mi = mutual_info_regression(X, y, random_state=0)`
			`assert_array_equal(np.argsort(-mi), np.array([1, 2, 0]))`
			`# XXX: should mutual_info_regression be fixed to avoid`
			`# up-casting float32 inputs to float64?`
			`assert mi.dtype == np.float64`


			`def test_mutual_info_classif_mixed(global_dtype):`
			`# Here the target is discrete and there are two continuous and one`
			`# discrete feature. The idea of this test is clear from the code.`
			`rng = check_random_state(0)`
			`X = rng.rand(1000, 3).astype(global_dtype, copy=False)`
			`X[:, 1] += X[:, 0]`
			`y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)`
			`X[:, 2] = X[:, 2] > 0.5`

			`mi = mutual_info_classif(X, y, discrete_features=[2], n_neighbors=3, random_state=0)`
			`assert_array_equal(np.argsort(-mi), [2, 0, 1])`
			`for n_neighbors in [5, 7, 9]:`
			`mi_nn = mutual_info_classif(`
			`X, y, discrete_features=[2], n_neighbors=n_neighbors, random_state=0`
			`)`
			`# Check that the continuous values have an higher MI with greater`
			`# n_neighbors`
			`assert mi_nn[0] > mi[0]`
			`assert mi_nn[1] > mi[1]`
			`# The n_neighbors should not have any effect on the discrete value`
			`# The MI should be the same`
			`assert mi_nn[2] == mi[2]`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_mutual_info_options(global_dtype, csr_container):`
			`X = np.array(`
			`[[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype`
			`)`
			`y = np.array([0, 1, 2, 2, 1], dtype=global_dtype)`
			`X_csr = csr_container(X)`

			`for mutual_info in (mutual_info_regression, mutual_info_classif):`
			`with pytest.raises(ValueError):`
			`mutual_info(X_csr, y, discrete_features=False)`
			`with pytest.raises(ValueError):`
			`mutual_info(X, y, discrete_features="manual")`
			`with pytest.raises(ValueError):`
			`mutual_info(X_csr, y, discrete_features=[True, False, True])`
			`with pytest.raises(IndexError):`
			`mutual_info(X, y, discrete_features=[True, False, True, False])`
			`with pytest.raises(IndexError):`
			`mutual_info(X, y, discrete_features=[1, 4])`

			`mi_1 = mutual_info(X, y, discrete_features="auto", random_state=0)`
			`mi_2 = mutual_info(X, y, discrete_features=False, random_state=0)`
			`mi_3 = mutual_info(X_csr, y, discrete_features="auto", random_state=0)`
			`mi_4 = mutual_info(X_csr, y, discrete_features=True, random_state=0)`
			`mi_5 = mutual_info(X, y, discrete_features=[True, False, True], random_state=0)`
			`mi_6 = mutual_info(X, y, discrete_features=[0, 2], random_state=0)`

			`assert_allclose(mi_1, mi_2)`
			`assert_allclose(mi_3, mi_4)`
			`assert_allclose(mi_5, mi_6)`

			`assert not np.allclose(mi_1, mi_3)`


			`@pytest.mark.parametrize("correlated", [True, False])`
			`def test_mutual_information_symmetry_classif_regression(correlated, global_random_seed):`
			"""Check that `mutual_info_classif` and `mutual_info_regression` are
			symmetric by switching the target `y` as `feature` in `X` and vice
			`versa.`

			`Non-regression test for:`
			`https://github.com/scikit-learn/scikit-learn/issues/23720`
			`"""`
			`rng = np.random.RandomState(global_random_seed)`
			`n = 100`
			`d = rng.randint(10, size=n)`

			`if correlated:`
			`c = d.astype(np.float64)`
			`else:`
			`c = rng.normal(0, 1, size=n)`

			`mi_classif = mutual_info_classif(`
			`c[:, None], d, discrete_features=[False], random_state=global_random_seed`
			`)`

			`mi_regression = mutual_info_regression(`
			`d[:, None], c, discrete_features=[True], random_state=global_random_seed`
			`)`

			`assert mi_classif == pytest.approx(mi_regression)`


			`def test_mutual_info_regression_X_int_dtype(global_random_seed):`
			`"""Check that results agree when X is integer dtype and float dtype.`

			`Non-regression test for Issue #26696.`
			`"""`
			`rng = np.random.RandomState(global_random_seed)`
			`X = rng.randint(100, size=(100, 10))`
			`X_float = X.astype(np.float64, copy=True)`
			`y = rng.randint(100, size=100)`

			`expected = mutual_info_regression(X_float, y, random_state=global_random_seed)`
			`result = mutual_info_regression(X, y, random_state=global_random_seed)`
			`assert_allclose(result, expected)`


			`@pytest.mark.parametrize(`
			`"mutual_info_func, data_generator",`
			`[`
			`(mutual_info_regression, make_regression),`
			`(mutual_info_classif, make_classification),`
			`],`
			`)`
			`def test_mutual_info_n_jobs(global_random_seed, mutual_info_func, data_generator):`
			"""Check that results are consistent with different `n_jobs`."""
			`X, y = data_generator(random_state=global_random_seed)`
			`single_job = mutual_info_func(X, y, random_state=global_random_seed, n_jobs=1)`
			`multi_job = mutual_info_func(X, y, random_state=global_random_seed, n_jobs=2)`
			`assert_allclose(single_job, multi_job)`