268 lines
8.9 KiB
Python
268 lines
8.9 KiB
Python
|
# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
|
||
|
#
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
from numbers import Real
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from ..base import OutlierMixin, _fit_context
|
||
|
from ..metrics import accuracy_score
|
||
|
from ..utils._param_validation import Interval
|
||
|
from ..utils.validation import check_is_fitted
|
||
|
from ._robust_covariance import MinCovDet
|
||
|
|
||
|
|
||
|
class EllipticEnvelope(OutlierMixin, MinCovDet):
    """An object for detecting outliers in a Gaussian distributed dataset.

    Read more in the :ref:`User Guide <outlier_detection>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, the support of robust location and covariance estimates
        is computed, and a covariance estimate is recomputed from it,
        without centering the data.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, default=None
        The proportion of points to be included in the support of the raw
        MCD estimate. If None, the minimum value of support_fraction will
        be used within the algorithm:
        `(n_samples + n_features + 1) / (2 * n_samples)`.
        Range is (0, 1).

    contamination : float, default=0.1
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Range is (0, 0.5].

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling
        the data. Pass an int for reproducible results across multiple function
        calls. See :term:`Glossary <random_state>`.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated robust location.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated robust covariance matrix.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute the
        robust estimates of location and shape.

    offset_ : float
        Offset used to define the decision function from the raw scores.
        We have the relation: ``decision_function = score_samples - offset_``.
        The offset depends on the contamination parameter and is defined in
        such a way we obtain the expected number of outliers (samples with
        decision function < 0) in training.

        .. versionadded:: 0.20

    raw_location_ : ndarray of shape (n_features,)
        The raw robust estimated location before correction and re-weighting.

    raw_covariance_ : ndarray of shape (n_features, n_features)
        The raw robust estimated covariance before correction and re-weighting.

    raw_support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute
        the raw robust estimates of location and shape, before correction
        and re-weighting.

    dist_ : ndarray of shape (n_samples,)
        Mahalanobis distances of the training set (on which :meth:`fit` is
        called) observations.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    Notes
    -----
    Outlier detection from covariance estimation may break or not
    perform well in high-dimensional settings. In particular, one will
    always take care to work with ``n_samples > n_features ** 2``.

    References
    ----------
    .. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the
       minimum covariance determinant estimator" Technometrics 41(3), 212
       (1999)

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import EllipticEnvelope
    >>> true_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0],
    ...                                                  cov=true_cov,
    ...                                                  size=500)
    >>> cov = EllipticEnvelope(random_state=0).fit(X)
    >>> # predict returns 1 for an inlier and -1 for an outlier
    >>> cov.predict([[0, 0],
    ...              [3, 3]])
    array([ 1, -1])
    >>> cov.covariance_
    array([[0.7411..., 0.2535...],
           [0.2535..., 0.3053...]])
    >>> cov.location_
    array([0.0813... , 0.0427...])
    """

    # Inherit MinCovDet's parameter validation and add the one extra
    # hyper-parameter this estimator introduces.
    _parameter_constraints: dict = {
        **MinCovDet._parameter_constraints,
        "contamination": [Interval(Real, 0, 0.5, closed="right")],
    }

    def __init__(
        self,
        *,
        store_precision=True,
        assume_centered=False,
        support_fraction=None,
        contamination=0.1,
        random_state=None,
    ):
        super().__init__(
            store_precision=store_precision,
            assume_centered=assume_centered,
            support_fraction=support_fraction,
            random_state=random_state,
        )
        self.contamination = contamination

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the EllipticEnvelope model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        super().fit(X)
        # Place the decision threshold so that a `contamination` fraction of
        # the training samples get a negative decision function (raw scores
        # are the negated Mahalanobis distances computed by the MCD fit).
        self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination)
        return self

    def decision_function(self, X):
        """Compute the decision function of the given observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        decision : ndarray of shape (n_samples,)
            Decision function of the samples.
            It is equal to the shifted Mahalanobis distances.
            The threshold for being an outlier is 0, which ensures a
            compatibility with other outlier detection algorithms.
        """
        check_is_fitted(self)
        # Shift the raw scores by offset_ so that 0 is the inlier/outlier
        # boundary, matching the convention of other outlier detectors.
        negative_mahal_dist = self.score_samples(X)
        return negative_mahal_dist - self.offset_

    def score_samples(self, X):
        """Compute the negative Mahalanobis distances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        negative_mahal_distances : array-like of shape (n_samples,)
            Opposite of the Mahalanobis distances.
        """
        check_is_fitted(self)
        # Negated so that, as for other estimators, larger score = more normal.
        return -self.mahalanobis(X)

    def predict(self, X):
        """
        Predict labels (1 inlier, -1 outlier) of X according to fitted model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and +1 for inliers.
        """
        values = self.decision_function(X)
        # Default everything to outlier (-1), then flip samples on or above
        # the decision boundary to inlier (+1).
        is_inlier = np.full(values.shape[0], -1, dtype=int)
        is_inlier[values >= 0] = 1

        return is_inlier

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) w.r.t. y.
        """
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
|