# Author: Nikolay Mayorov <n59_ru@hotmail.com>
# License: 3-clause BSD

import numpy as np
from scipy.sparse import issparse
from scipy.special import digamma

from ..metrics.cluster import mutual_info_score
from ..neighbors import NearestNeighbors, KDTree
from ..preprocessing import scale
from ..utils import check_random_state
from ..utils.fixes import _astype_copy_false
from ..utils.validation import check_array, check_X_y
from ..utils.validation import _deprecate_positional_args
from ..utils.multiclass import check_classification_targets


def _compute_mi_cc(x, y, n_neighbors):
    """Compute mutual information between two continuous variables.

    Parameters
    ----------
    x, y : ndarray, shape (n_samples,)
        Samples of two continuous random variables, must have an identical
        shape.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If it turned out to be negative it is
        replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a numerical
    method is negative, it means (providing the method is adequate) that the
    mutual information is close to 0 and replacing it by 0 is a reasonable
    strategy.

    References
    ----------
    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    """
    n_samples = x.size

    x = x.reshape((-1, 1))
    y = y.reshape((-1, 1))
    xy = np.hstack((x, y))

    # Here we rely on NearestNeighbors to select the fastest algorithm.
    nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors)

    nn.fit(xy)
    radius = nn.kneighbors()[0]
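    # Shrink each k-th neighbor distance by one ulp so that the query_radius
    # counts below exclude points lying exactly at the k-th neighbor's
    # Chebyshev distance.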
    radius = np.nextafter(radius[:, -1], 0)

    # KDTree is explicitly fit to allow for the querying of number of
    # neighbors within a specified radius
    kd = KDTree(x, metric='chebyshev')
    nx = kd.query_radius(x, radius, count_only=True, return_distance=False)
    nx = np.array(nx) - 1.0

    kd = KDTree(y, metric='chebyshev')
    ny = kd.query_radius(y, radius, count_only=True, return_distance=False)
    ny = np.array(ny) - 1.0

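    # Kraskov et al. estimator:
    # I(x, y) = psi(k) + psi(N) - <psi(n_x + 1) + psi(n_y + 1)>,
    # where psi is the digamma function, k = n_neighbors, N = n_samples and
    # n_x, n_y are the marginal neighbor counts computed above.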
    mi = (digamma(n_samples) + digamma(n_neighbors) -
          np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1)))

    return max(0, mi)


def _compute_mi_cd(c, d, n_neighbors):
    """Compute mutual information between continuous and discrete variables.

    Parameters
    ----------
    c : ndarray, shape (n_samples,)
        Samples of a continuous random variable.

    d : ndarray, shape (n_samples,)
        Samples of a discrete random variable.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If it turned out to be negative it is
        replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a numerical
    method is negative, it means (providing the method is adequate) that the
    mutual information is close to 0 and replacing it by 0 is a reasonable
    strategy.

    References
    ----------
    .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
           Data Sets". PLoS ONE 9(2), 2014.
    """
    n_samples = c.shape[0]
    c = c.reshape((-1, 1))

    radius = np.empty(n_samples)
    label_counts = np.empty(n_samples)
    k_all = np.empty(n_samples)
    nn = NearestNeighbors()
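    # For each discrete label, find each point's distance to its k-th nearest
    # neighbor among points sharing that label; k is capped at count - 1 so a
    # label with few samples still has enough same-label neighbors.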
    for label in np.unique(d):
        mask = d == label
        count = np.sum(mask)
        if count > 1:
            k = min(n_neighbors, count - 1)
            nn.set_params(n_neighbors=k)
            nn.fit(c[mask])
            r = nn.kneighbors()[0]
            radius[mask] = np.nextafter(r[:, -1], 0)
            k_all[mask] = k
        label_counts[mask] = count

    # Ignore points with unique labels.
    mask = label_counts > 1
    n_samples = np.sum(mask)
    label_counts = label_counts[mask]
    k_all = k_all[mask]
    c = c[mask]
    radius = radius[mask]

    kd = KDTree(c)
    m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
    m_all = np.array(m_all) - 1.0

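    # Ross estimator:
    # I(c, d) = psi(N) + <psi(k)> - <psi(N_d)> - <psi(m + 1)>,
    # where N_d is the number of samples sharing a point's label and m is the
    # number of samples (of any label) within that point's radius.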
    mi = (digamma(n_samples) + np.mean(digamma(k_all)) -
          np.mean(digamma(label_counts)) -
          np.mean(digamma(m_all + 1)))

    return max(0, mi)


def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
    """Compute mutual information between two variables.

    This is a simple wrapper which selects a proper function to call based on
    whether `x` and `y` are discrete or not.
    """
    if x_discrete and y_discrete:
        return mutual_info_score(x, y)
    elif x_discrete and not y_discrete:
        return _compute_mi_cd(y, x, n_neighbors)
    elif not x_discrete and y_discrete:
        return _compute_mi_cd(x, y, n_neighbors)
    else:
        return _compute_mi_cc(x, y, n_neighbors)


def _iterate_columns(X, columns=None):
    """Iterate over columns of a matrix.

    Parameters
    ----------
    X : ndarray or csc_matrix, shape (n_samples, n_features)
        Matrix over which to iterate.

    columns : iterable or None, default=None
        Indices of columns to iterate over. If None, iterate over all columns.

    Yields
    ------
    x : ndarray, shape (n_samples,)
        Columns of `X` in dense format.
    """
    if columns is None:
        columns = range(X.shape[1])

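    # In CSC format, column i is stored as X.data[X.indptr[i]:X.indptr[i + 1]]
    # with the corresponding row indices in X.indices, so each column can be
    # densified individually without converting the whole matrix.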
    if issparse(X):
        for i in columns:
            x = np.zeros(X.shape[0])
            start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1]
            x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr]
            yield x
    else:
        for i in columns:
            yield X[:, i]


def _estimate_mi(X, y, discrete_features='auto', discrete_target=False,
                 n_neighbors=3, copy=True, random_state=None):
    """Estimate mutual information between the features and the target.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples,)
        Target vector.

    discrete_features : {'auto', bool, array-like}, default='auto'
        If bool, then determines whether to consider all features discrete
        or continuous. If array, then it should be either a boolean mask
        with shape (n_features,) or array with indices of discrete features.
        If 'auto', it is assigned to False for dense `X` and to True for
        sparse `X`.

    discrete_target : bool, default=False
        Whether to consider `y` as a discrete variable.

    n_neighbors : int, default=3
        Number of neighbors to use for MI estimation for continuous variables,
        see [1]_ and [2]_. Higher values reduce variance of the estimation,
        but could introduce a bias.

    copy : bool, default=True
        Whether to make a copy of the given data. If set to False, the initial
        data will be overwritten.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for adding small noise to
        continuous variables in order to remove repeated values.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    mi : ndarray, shape (n_features,)
        Estimated mutual information between each feature and the target.
        A negative value will be replaced by 0.

    References
    ----------
    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    .. [2] B. C. Ross "Mutual Information between Discrete and Continuous
           Data Sets". PLoS ONE 9(2), 2014.
    """
    X, y = check_X_y(X, y, accept_sparse='csc', y_numeric=not discrete_target)
    n_samples, n_features = X.shape

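    # Normalize `discrete_features` ('auto', a bool, an index array or a
    # boolean mask) into a boolean mask of shape (n_features,).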
    if isinstance(discrete_features, (str, bool)):
        if isinstance(discrete_features, str):
            if discrete_features == 'auto':
                discrete_features = issparse(X)
            else:
                raise ValueError("Invalid string value for discrete_features.")
        discrete_mask = np.empty(n_features, dtype=bool)
        discrete_mask.fill(discrete_features)
    else:
        discrete_features = check_array(discrete_features, ensure_2d=False)
        if discrete_features.dtype != 'bool':
            discrete_mask = np.zeros(n_features, dtype=bool)
            discrete_mask[discrete_features] = True
        else:
            discrete_mask = discrete_features

    continuous_mask = ~discrete_mask
    if np.any(continuous_mask) and issparse(X):
        raise ValueError("Sparse matrix `X` can't have continuous features.")

    rng = check_random_state(random_state)
    if np.any(continuous_mask):
        if copy:
            X = X.copy()

        if not discrete_target:
            X[:, continuous_mask] = scale(X[:, continuous_mask],
                                          with_mean=False, copy=False)

        # Add small noise to continuous features as advised in Kraskov et al.
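        # The k-NN based estimators assume there are no exact ties among the
        # continuous samples; the jitter below is scaled to each feature's
        # mean absolute value so it stays negligible relative to the data.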
        X = X.astype(float, **_astype_copy_false(X))
        means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0))
        X[:, continuous_mask] += 1e-10 * means * rng.randn(
                n_samples, np.sum(continuous_mask))

    if not discrete_target:
        y = scale(y, with_mean=False)
        y += 1e-10 * np.maximum(1, np.mean(np.abs(y))) * rng.randn(n_samples)

    mi = [_compute_mi(x, y, discrete_feature, discrete_target, n_neighbors) for
          x, discrete_feature in zip(_iterate_columns(X), discrete_mask)]

    return np.array(mi)


@_deprecate_positional_args
def mutual_info_regression(X, y, *, discrete_features='auto', n_neighbors=3,
                           copy=True, random_state=None):
    """Estimate mutual information for a continuous target variable.

    Mutual information (MI) [1]_ between two random variables is a
    non-negative value, which measures the dependency between the variables.
    It is equal to zero if and only if two random variables are independent,
    and higher values mean higher dependency.

    The function relies on nonparametric methods based on entropy estimation
    from k-nearest neighbors distances as described in [2]_ and [3]_. Both
    methods are based on the idea originally proposed in [4]_.

    It can be used for univariate feature selection; read more in the
    :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples,)
        Target vector.

    discrete_features : {'auto', bool, array-like}, default='auto'
        If bool, then determines whether to consider all features discrete
        or continuous. If array, then it should be either a boolean mask
        with shape (n_features,) or array with indices of discrete features.
        If 'auto', it is assigned to False for dense `X` and to True for
        sparse `X`.

    n_neighbors : int, default=3
        Number of neighbors to use for MI estimation for continuous variables,
        see [2]_ and [3]_. Higher values reduce variance of the estimation,
        but could introduce a bias.

    copy : bool, default=True
        Whether to make a copy of the given data. If set to False, the initial
        data will be overwritten.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for adding small noise to
        continuous variables in order to remove repeated values.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    mi : ndarray, shape (n_features,)
        Estimated mutual information between each feature and the target.

    Notes
    -----
    1. The term "discrete features" is used instead of naming them
       "categorical", because it describes the essence more accurately.
       For example, pixel intensities of an image are discrete features
       (but hardly categorical) and you will get better results if you mark
       them as such. Also note that treating a continuous variable as
       discrete and vice versa will usually give incorrect results, so be
       attentive about that.
    2. True mutual information can't be negative. If its estimate turns out
       to be negative, it is replaced by zero.

    References
    ----------
    .. [1] `Mutual Information
           <https://en.wikipedia.org/wiki/Mutual_information>`_
           on Wikipedia.
    .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    .. [3] B. C. Ross "Mutual Information between Discrete and Continuous
           Data Sets". PLoS ONE 9(2), 2014.
    .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy
           of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
    """
    return _estimate_mi(X, y, discrete_features, False, n_neighbors,
                        copy, random_state)
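
# A minimal usage sketch for mutual_info_regression (illustrative only; the
# synthetic data below is an assumption, not part of this module):
#
#     import numpy as np
#     from sklearn.feature_selection import mutual_info_regression
#
#     rng = np.random.RandomState(0)
#     X = rng.rand(1000, 3)
#     y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * rng.randn(1000)
#     mi = mutual_info_regression(X, y, random_state=0)  # one value per column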

@_deprecate_positional_args
def mutual_info_classif(X, y, *, discrete_features='auto', n_neighbors=3,
                        copy=True, random_state=None):
    """Estimate mutual information for a discrete target variable.

    Mutual information (MI) [1]_ between two random variables is a
    non-negative value, which measures the dependency between the variables.
    It is equal to zero if and only if two random variables are independent,
    and higher values mean higher dependency.

    The function relies on nonparametric methods based on entropy estimation
    from k-nearest neighbors distances as described in [2]_ and [3]_. Both
    methods are based on the idea originally proposed in [4]_.

    It can be used for univariate feature selection; read more in the
    :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples,)
        Target vector.

    discrete_features : {'auto', bool, array-like}, default='auto'
        If bool, then determines whether to consider all features discrete
        or continuous. If array, then it should be either a boolean mask
        with shape (n_features,) or array with indices of discrete features.
        If 'auto', it is assigned to False for dense `X` and to True for
        sparse `X`.

    n_neighbors : int, default=3
        Number of neighbors to use for MI estimation for continuous variables,
        see [2]_ and [3]_. Higher values reduce variance of the estimation,
        but could introduce a bias.

    copy : bool, default=True
        Whether to make a copy of the given data. If set to False, the initial
        data will be overwritten.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for adding small noise to
        continuous variables in order to remove repeated values.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    mi : ndarray, shape (n_features,)
        Estimated mutual information between each feature and the target.

    Notes
    -----
    1. The term "discrete features" is used instead of naming them
       "categorical", because it describes the essence more accurately.
       For example, pixel intensities of an image are discrete features
       (but hardly categorical) and you will get better results if you mark
       them as such. Also note that treating a continuous variable as
       discrete and vice versa will usually give incorrect results, so be
       attentive about that.
    2. True mutual information can't be negative. If its estimate turns out
       to be negative, it is replaced by zero.

    References
    ----------
    .. [1] `Mutual Information
           <https://en.wikipedia.org/wiki/Mutual_information>`_
           on Wikipedia.
    .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    .. [3] B. C. Ross "Mutual Information between Discrete and Continuous
           Data Sets". PLoS ONE 9(2), 2014.
    .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy
           of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
    """
    check_classification_targets(y)
    return _estimate_mi(X, y, discrete_features, True, n_neighbors,
                        copy, random_state)
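
# A minimal usage sketch for mutual_info_classif (illustrative only; the
# dataset loader below is a standard sklearn utility, used here purely as an
# example):
#
#     from sklearn.datasets import load_iris
#     from sklearn.feature_selection import mutual_info_classif
#
#     X, y = load_iris(return_X_y=True)
#     mi = mutual_info_classif(X, y, random_state=0)  # one value per feature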