349 lines
11 KiB
Python
349 lines
11 KiB
Python
"""
|
|
Maximum likelihood covariance estimator.
|
|
|
|
"""
|
|
|
|
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Gael Varoquaux <gael.varoquaux@normalesup.org>
#         Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
|
|
|
|
# avoid division truncation
|
|
import warnings
|
|
import numpy as np
|
|
from scipy import linalg
|
|
|
|
from .. import config_context
|
|
from ..base import BaseEstimator
|
|
from ..utils import check_array
|
|
from ..utils.extmath import fast_logdet
|
|
from ..metrics.pairwise import pairwise_distances
|
|
|
|
|
|
def log_likelihood(emp_cov, precision):
    """Return the sample mean of the Gaussian log-likelihood.

    The value is the empirical expected log-likelihood of a covariance model,
    including the normalization constant and the 1/2 scaling, so that it can
    be compared with values computed outside of this package.

    Parameters
    ----------
    emp_cov : ndarray of shape (n_features, n_features)
        Maximum Likelihood Estimator of covariance.

    precision : ndarray of shape (n_features, n_features)
        The precision matrix of the covariance model to be tested.

    Returns
    -------
    log_likelihood_ : float
        Sample mean of the log-likelihood.
    """
    n_features = precision.shape[0]

    # Element-wise product summed over all entries.
    trace_term = np.sum(emp_cov * precision)
    log_det = fast_logdet(precision)
    # Gaussian normalization constant: n_features * log(2 * pi).
    normalization = n_features * np.log(2 * np.pi)

    return (-trace_term + log_det - normalization) / 2.0
|
|
|
|
|
|
def empirical_covariance(X, *, assume_centered=False):
    """Compute the maximum likelihood estimate of the covariance matrix.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data from which to compute the covariance estimate. A 1-D input is
        treated as a single sample.

    assume_centered : bool, default=False
        If `True`, the data are used as-is (no centering) and the estimate is
        ``X.T @ X / n_samples``. Useful when working with data whose mean is
        almost, but not exactly zero.
        If `False`, the data are centered before computation.

    Returns
    -------
    covariance : ndarray of shape (n_features, n_features)
        Empirical covariance (Maximum Likelihood Estimator).

    Examples
    --------
    >>> from sklearn.covariance import empirical_covariance
    >>> X = [[1,1,1],[1,1,1],[1,1,1],
    ...      [0,0,0],[0,0,0],[0,0,0]]
    >>> empirical_covariance(X)
    array([[0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25]])
    """
    data = np.asarray(X)

    # A flat vector is interpreted as one observation of n_features values.
    if data.ndim == 1:
        data = data.reshape(1, -1)

    n_samples = data.shape[0]
    if n_samples == 1:
        warnings.warn(
            "Only one sample available. You may want to reshape your data array"
        )

    if not assume_centered:
        # bias=True normalizes by n_samples (the MLE), not n_samples - 1.
        estimate = np.cov(data.T, bias=True)
    else:
        estimate = np.dot(data.T, data) / n_samples

    # A scalar (0-d) result is promoted to a (1, 1) matrix so that callers
    # always receive a 2-D array.
    if estimate.ndim == 0:
        estimate = np.array([[estimate]])
    return estimate
|
|
|
|
|
|
class EmpiricalCovariance(BaseEstimator):
    """Maximum likelihood covariance estimator.

    Read more in the :ref:`User Guide <covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specifies if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data are not centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data are centered before computation.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo-inverse matrix.
        (stored only if store_precision is True)

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import EmpiricalCovariance
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                             cov=real_cov,
    ...                             size=500)
    >>> cov = EmpiricalCovariance().fit(X)
    >>> cov.covariance_
    array([[0.7569..., 0.2818...],
           [0.2818..., 0.3928...]])
    >>> cov.location_
    array([0.0622..., 0.0193...])
    """

    _parameter_constraints: dict = {
        "store_precision": ["boolean"],
        "assume_centered": ["boolean"],
    }

    def __init__(self, *, store_precision=True, assume_centered=False):
        self.store_precision = store_precision
        self.assume_centered = assume_centered

    def _set_covariance(self, covariance):
        """Store the covariance estimate and, optionally, its precision.

        The precision is computed as the pseudo-inverse of the covariance and
        kept only when `self.store_precision` is true; otherwise
        `self.precision_` is set to `None`.

        Parameters
        ----------
        covariance : array-like of shape (n_features, n_features)
            Estimated covariance matrix to be stored, and from which precision
            is computed.
        """
        covariance = check_array(covariance)
        self.covariance_ = covariance
        self.precision_ = (
            linalg.pinvh(covariance, check_finite=False)
            if self.store_precision
            else None
        )

    def get_precision(self):
        """Getter for the precision matrix.

        Returns
        -------
        precision_ : array-like of shape (n_features, n_features)
            The precision matrix associated to the current covariance object.
        """
        if not self.store_precision:
            # Nothing was cached at fit time: derive it from the covariance.
            return linalg.pinvh(self.covariance_, check_finite=False)
        return self.precision_

    def fit(self, X, y=None):
        """Fit the maximum likelihood covariance estimator to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        self._validate_params()
        X = self._validate_data(X)

        centered = self.assume_centered
        # With centered data the location is zero by assumption; otherwise it
        # is the per-feature mean of the training data.
        self.location_ = np.zeros(X.shape[1]) if centered else X.mean(0)
        self._set_covariance(empirical_covariance(X, assume_centered=centered))

        return self

    def score(self, X_test, y=None):
        """Compute the log-likelihood of `X_test` under the estimated Gaussian model.

        The Gaussian model is defined by its mean and covariance matrix which are
        represented respectively by `self.location_` and `self.covariance_`.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)
            Test data of which we compute the likelihood, where `n_samples` is
            the number of samples and `n_features` is the number of features.
            `X_test` is assumed to be drawn from the same distribution as
            the data used in fit (including centering).

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        res : float
            The log-likelihood of `X_test` with `self.location_` and `self.covariance_`
            as estimators of the Gaussian model mean and covariance matrix respectively.
        """
        X_test = self._validate_data(X_test, reset=False)

        # Shift the test data by the fitted location, then estimate its
        # covariance without re-centering.
        shifted = X_test - self.location_
        test_cov = empirical_covariance(shifted, assume_centered=True)

        return log_likelihood(test_cov, self.get_precision())

    def error_norm(self, comp_cov, norm="frobenius", scaling=True, squared=True):
        """Compute the Mean Squared Error between two covariance estimators.

        Parameters
        ----------
        comp_cov : array-like of shape (n_features, n_features)
            The covariance to compare with.

        norm : {"frobenius", "spectral"}, default="frobenius"
            The type of norm used to compute the error. Available error types:
            - 'frobenius' (default): sqrt(tr(A^t.A))
            - 'spectral': sqrt(max(eigenvalues(A^t.A)))
            where A is the error ``(comp_cov - self.covariance_)``.

        scaling : bool, default=True
            If True (default), the squared error norm is divided by n_features.
            If False, the squared error norm is not rescaled.

        squared : bool, default=True
            Whether to compute the squared error norm or the error norm.
            If True (default), the squared error norm is returned.
            If False, the error norm is returned.

        Returns
        -------
        result : float
            The Mean Squared Error (in the sense of the Frobenius norm) between
            `self` and `comp_cov` covariance estimators.
        """
        diff = comp_cov - self.covariance_

        if norm == "frobenius":
            sq_norm = np.sum(diff**2)
        elif norm == "spectral":
            gram = np.dot(diff.T, diff)
            sq_norm = np.amax(linalg.svdvals(gram))
        else:
            raise NotImplementedError(
                "Only spectral and frobenius norms are implemented"
            )

        if scaling:
            sq_norm = sq_norm / diff.shape[0]

        return sq_norm if squared else np.sqrt(sq_norm)

    def mahalanobis(self, X):
        """Compute the squared Mahalanobis distances of given observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The observations whose Mahalanobis distances are computed.
            Observations are assumed to be drawn from the same distribution
            as the data used in fit.

        Returns
        -------
        dist : ndarray of shape (n_samples,)
            Squared Mahalanobis distances of the observations.
        """
        X = self._validate_data(X, reset=False)

        precision = self.get_precision()
        # Single reference point: the fitted location as a (1, n_features) row.
        center = self.location_[np.newaxis, :]
        with config_context(assume_finite=True):
            dist = pairwise_distances(
                X, center, metric="mahalanobis", VI=precision
            )

        # Flatten the (n_samples, 1) distances and square them.
        return np.reshape(dist, (len(X),)) ** 2
|