546 lines
19 KiB
Python
546 lines
19 KiB
Python
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
|
|
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
|
# License: BSD 3 clause
|
|
|
|
import numbers
|
|
import numpy as np
|
|
from scipy.sparse import issparse
|
|
from warnings import warn
|
|
from numbers import Integral, Real
|
|
|
|
from ..tree import ExtraTreeRegressor
|
|
from ..tree._tree import DTYPE as tree_dtype
|
|
from ..utils import (
|
|
check_random_state,
|
|
check_array,
|
|
gen_batches,
|
|
get_chunk_n_rows,
|
|
)
|
|
from ..utils._param_validation import Interval, StrOptions
|
|
from ..utils.validation import check_is_fitted, _num_samples
|
|
from ..base import OutlierMixin
|
|
|
|
from ._bagging import BaseBagging
|
|
|
|
# Names exported by ``from <this module> import *``; only the estimator class
# is listed here.
__all__ = ["IsolationForest"]
|
|
|
|
|
|
class IsolationForest(OutlierMixin, BaseBagging):
    """
    Isolation Forest Algorithm.

    Return the anomaly score of each sample using the IsolationForest algorithm

    The IsolationForest 'isolates' observations by randomly selecting a feature
    and then randomly selecting a split value between the maximum and minimum
    values of the selected feature.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produce shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    Read more in the :ref:`User Guide <isolation_forest>`.

    .. versionadded:: 0.18

    Parameters
    ----------
    n_estimators : int, default=100
        The number of base estimators in the ensemble.

    max_samples : "auto", int or float, default="auto"
        The number of samples to draw from X to train each base estimator.
            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
            - If "auto", then `max_samples=min(256, n_samples)`.

        If max_samples is larger than the number of samples provided,
        all samples will be used for all trees (no sampling).

    contamination : 'auto' or float, default='auto'
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the threshold
        on the scores of the samples.

            - If 'auto', the threshold is determined as in the
              original paper.
            - If float, the contamination should be in the range (0, 0.5].

        .. versionchanged:: 0.22
           The default value of ``contamination`` changed from 0.1
           to ``'auto'``.

    max_features : int or float, default=1.0
        The number of features to draw from X to train each base estimator.

            - If int, then draw `max_features` features.
            - If float, then draw `max(1, int(max_features * n_features_in_))` features.

        Note: using a float number less than 1.0 or integer less than number of
        features will enable feature subsampling and leads to a longer runtime.

    bootstrap : bool, default=False
        If True, individual trees are fit on random subsets of the training
        data sampled with replacement. If False, sampling without replacement
        is performed.

    n_jobs : int, default=None
        The number of jobs to run in parallel for both :meth:`fit` and
        :meth:`predict`. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See :term:`Glossary <n_jobs>` for more details.

    random_state : int, RandomState instance or None, default=None
        Controls the pseudo-randomness of the selection of the feature
        and split values for each branching step and each tree in the forest.

        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    verbose : int, default=0
        Controls the verbosity of the tree building process.

    warm_start : bool, default=False
        When set to ``True``, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit a whole
        new forest. See :term:`the Glossary <warm_start>`.

        .. versionadded:: 0.21

    Attributes
    ----------
    estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance
        The child estimator template used to create the collection of
        fitted sub-estimators.

        .. versionadded:: 1.2
           `base_estimator_` was renamed to `estimator_`.

    base_estimator_ : ExtraTreeRegressor instance
        The child estimator template used to create the collection of
        fitted sub-estimators.

        .. deprecated:: 1.2
            `base_estimator_` is deprecated and will be removed in 1.4.
            Use `estimator_` instead.

    estimators_ : list of ExtraTreeRegressor instances
        The collection of fitted sub-estimators.

    estimators_features_ : list of ndarray
        The subset of drawn features for each base estimator.

    estimators_samples_ : list of ndarray
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator.

    max_samples_ : int
        The actual number of samples.

    offset_ : float
        Offset used to define the decision function from the raw scores. We
        have the relation: ``decision_function = score_samples - offset_``.
        ``offset_`` is defined as follows. When the contamination parameter is
        set to "auto", the offset is equal to -0.5 as the scores of inliers are
        close to 0 and the scores of outliers are close to -1. When a
        contamination parameter different than "auto" is provided, the offset
        is defined in such a way we obtain the expected number of outliers
        (samples with decision function < 0) in training.

        .. versionadded:: 0.20

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a
        Gaussian distributed dataset.
    sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.
        Estimate the support of a high-dimensional distribution.
        The implementation is based on libsvm.
    sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection
        using Local Outlier Factor (LOF).

    Notes
    -----
    The implementation is based on an ensemble of ExtraTreeRegressor. The
    maximum depth of each tree is set to ``ceil(log_2(n))`` where
    :math:`n` is the number of samples used to build the tree
    (see (Liu et al., 2008) for more details).

    References
    ----------
    .. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
           Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
    .. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based
           anomaly detection." ACM Transactions on Knowledge Discovery from
           Data (TKDD) 6.1 (2012): 3.

    Examples
    --------
    >>> from sklearn.ensemble import IsolationForest
    >>> X = [[-1.1], [0.3], [0.5], [100]]
    >>> clf = IsolationForest(random_state=0).fit(X)
    >>> clf.predict([[0.1], [0], [90]])
    array([ 1,  1, -1])
    """

    # Declarative constraints checked by ``self._validate_params()`` in ``fit``.
    _parameter_constraints: dict = {
        "n_estimators": [Interval(Integral, 1, None, closed="left")],
        "max_samples": [
            StrOptions({"auto"}),
            Interval(Integral, 1, None, closed="left"),
            Interval(Real, 0, 1, closed="right"),
        ],
        "contamination": [
            StrOptions({"auto"}),
            Interval(Real, 0, 0.5, closed="right"),
        ],
        "max_features": [
            Integral,
            Interval(Real, 0, 1, closed="right"),
        ],
        "bootstrap": ["boolean"],
        "n_jobs": [Integral, None],
        "random_state": ["random_state"],
        "verbose": ["verbose"],
        "warm_start": ["boolean"],
    }

    def __init__(
        self,
        *,
        n_estimators=100,
        max_samples="auto",
        contamination="auto",
        max_features=1.0,
        bootstrap=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        warm_start=False,
    ):
        super().__init__(
            estimator=ExtraTreeRegressor(
                max_features=1, splitter="random", random_state=random_state
            ),
            # here above max_features has no links with self.max_features
            bootstrap=bootstrap,
            bootstrap_features=False,
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
        )

        self.contamination = contamination

    def _set_oob_score(self, X, y):
        """Out-of-bag scoring is not available for this estimator."""
        raise NotImplementedError("OOB score not supported by iforest")

    def _parallel_args(self):
        """Return the parallel-backend hints used when fitting the trees."""
        # ExtraTreeRegressor releases the GIL, so it's more efficient to use
        # a thread-based backend rather than a process-based backend so as
        # to avoid suffering from communication overhead and extra memory
        # copies.
        return {"prefer": "threads"}

    def fit(self, X, y=None, sample_weight=None):
        """
        Fit estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Use ``dtype=np.float32`` for maximum
            efficiency. Sparse matrices are also supported, use sparse
            ``csc_matrix`` for maximum efficiency.

        y : Ignored
            Not used, present for API consistency by convention.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self._validate_params()
        X = self._validate_data(X, accept_sparse=["csc"], dtype=tree_dtype)
        if issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        rnd = check_random_state(self.random_state)
        # The trees are fit on a random target: only the random splits matter.
        y = rnd.uniform(size=X.shape[0])

        # ensure that max_sample is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, str) and self.max_samples == "auto":
            max_samples = min(256, n_samples)

        elif isinstance(self.max_samples, numbers.Integral):
            if self.max_samples > n_samples:
                warn(
                    "max_samples (%s) is greater than the "
                    "total number of samples (%s). max_samples "
                    "will be set to n_samples for estimation."
                    % (self.max_samples, n_samples)
                )
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # max_samples is float
            max_samples = int(self.max_samples * n_samples)

        self.max_samples_ = max_samples
        max_depth = int(np.ceil(np.log2(max(max_samples, 2))))
        super()._fit(
            X,
            y,
            max_samples,
            max_depth=max_depth,
            sample_weight=sample_weight,
            check_input=False,
        )

        if self.contamination == "auto":
            # 0.5 plays a special role as described in the original paper.
            # we take the opposite as we consider the opposite of their score.
            self.offset_ = -0.5
            return self

        # else, define offset_ wrt contamination parameter.
        # X has already been validated above, so score it through the private
        # chunked scorer instead of the public ``score_samples``: re-validating
        # the converted array (which no longer carries column names) would
        # emit a spurious feature-names warning and repeat the validation work.
        # The scorer slices rows, so hand it CSR rather than the CSC used for
        # fitting (the public ``score_samples`` performs the same conversion).
        X_scoring = X.tocsr() if issparse(X) else X
        training_scores = -self._compute_chunked_score_samples(X_scoring)
        self.offset_ = np.percentile(training_scores, 100.0 * self.contamination)

        return self

    def predict(self, X):
        """
        Predict if a particular sample is an outlier or not.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            For each observation, tells whether or not (+1 or -1) it should
            be considered as an inlier according to the fitted model.
        """
        check_is_fitted(self)
        decision_func = self.decision_function(X)
        # Start from "all inliers" and flag the negative decisions as outliers.
        is_inlier = np.ones_like(decision_func, dtype=int)
        is_inlier[decision_func < 0] = -1
        return is_inlier

    def decision_function(self, X):
        """
        Average anomaly score of X of the base classifiers.

        The anomaly score of an input sample is computed as
        the mean anomaly score of the trees in the forest.

        The measure of normality of an observation given a tree is the depth
        of the leaf containing this observation, which is equivalent to
        the number of splittings required to isolate this point. In case of
        several observations n_left in the leaf, the average path length of
        a n_left samples isolation tree is added.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            The anomaly score of the input samples.
            The lower, the more abnormal. Negative scores represent outliers,
            positive scores represent inliers.
        """
        # We subtract self.offset_ to make 0 be the threshold value for being
        # an outlier:

        return self.score_samples(X) - self.offset_

    def score_samples(self, X):
        """
        Opposite of the anomaly score defined in the original paper.

        The anomaly score of an input sample is computed as
        the mean anomaly score of the trees in the forest.

        The measure of normality of an observation given a tree is the depth
        of the leaf containing this observation, which is equivalent to
        the number of splittings required to isolate this point. In case of
        several observations n_left in the leaf, the average path length of
        a n_left samples isolation tree is added.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to the dtype
            used by the trees and, if a sparse matrix is provided, to a
            sparse ``csr_matrix``.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            The anomaly score of the input samples.
            The lower, the more abnormal.
        """
        # code structure from ForestClassifier/predict_proba

        check_is_fitted(self)

        # Check data. Converting to the tree dtype once here (the same dtype
        # ``fit`` validates with) spares every tree from converting the input
        # again when it is applied.
        X = self._validate_data(
            X, accept_sparse="csr", dtype=tree_dtype, reset=False
        )

        # Take the opposite of the scores as bigger is better (here less
        # abnormal)
        return -self._compute_chunked_score_samples(X)

    def _compute_chunked_score_samples(self, X):
        """Score X in row batches and return the (non-negated) scores.

        Parameters
        ----------
        X : array-like or sparse matrix
            Already validated data matrix.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            Output of ``_compute_score_samples`` for every row of X.
        """
        n_samples = _num_samples(X)

        # Features only need to be re-selected per tree when the trees were
        # trained on a subset of the columns.
        subsample_features = self._max_features != X.shape[1]

        # We get as many rows as possible within our working_memory budget
        # (defined by sklearn.get_config()['working_memory']) to store
        # self._max_features in each row during computation.
        #
        # Note:
        #  - this will get at least 1 row, even if 1 row of score will
        #    exceed working_memory.
        #  - this does only account for temporary memory usage while loading
        #    the data needed to compute the scores -- the returned scores
        #    themselves are 1D.

        chunk_n_rows = get_chunk_n_rows(
            row_bytes=16 * self._max_features, max_n_rows=n_samples
        )
        slices = gen_batches(n_samples, chunk_n_rows)

        scores = np.zeros(n_samples, order="f")

        for sl in slices:
            # compute score on the slices of test samples:
            scores[sl] = self._compute_score_samples(X[sl], subsample_features)

        return scores

    def _compute_score_samples(self, X, subsample_features):
        """
        Compute the score of each samples in X going through the extra trees.

        Parameters
        ----------
        X : array-like or sparse matrix
            Data matrix.

        subsample_features : bool
            Whether features should be subsampled.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            ``2 ** (-depth / denominator)`` per sample, where ``depth`` is the
            path length summed over all trees and ``denominator`` is the
            number of trees times the average path length for
            ``max_samples_`` samples.
        """
        n_samples = X.shape[0]

        depths = np.zeros(n_samples, order="f")

        for tree, features in zip(self.estimators_, self.estimators_features_):
            X_subset = X[:, features] if subsample_features else X

            leaves_index = tree.apply(X_subset)
            node_indicator = tree.decision_path(X_subset)
            n_samples_leaf = tree.tree_.n_node_samples[leaves_index]

            # Nodes on the path minus one gives the number of edges; leaves
            # holding several training samples get the average path length of
            # a tree built on that many samples added on top.
            depths += (
                np.ravel(node_indicator.sum(axis=1))
                + _average_path_length(n_samples_leaf)
                - 1.0
            )
        denominator = len(self.estimators_) * _average_path_length([self.max_samples_])
        scores = 2 ** (
            # For a single training sample, denominator and depth are 0.
            # Therefore, we set the score manually to 1.
            -np.divide(
                depths, denominator, out=np.ones_like(depths), where=denominator != 0
            )
        )
        return scores

    def _more_tags(self):
        """Return estimator tags: the common checks expected to fail."""
        return {
            "_xfail_checks": {
                "check_sample_weights_invariance": (
                    "zero sample_weight is not equivalent to removing samples"
                ),
            }
        }
|
|
|
|
|
|
def _average_path_length(n_samples_leaf):
    """
    Average path length of an isolation tree built on a given sample count.

    This equals the average path length of an unsuccessful BST search, since
    the latter has the same structure as an isolation tree.

    Parameters
    ----------
    n_samples_leaf : array-like of shape (n_samples,)
        The number of training samples in each test sample leaf, for
        each estimators.

    Returns
    -------
    average_path_length : ndarray of shape (n_samples,)
        Array with the same shape as the validated input: 0 for counts of at
        most 1, 1 for counts equal to 2, and the closed-form estimate
        otherwise.
    """
    counts = check_array(n_samples_leaf, ensure_2d=False)

    # Work on a (1, n) view and restore the caller's shape at the very end.
    original_shape = counts.shape
    counts = counts.reshape((1, -1))

    is_trivial = counts <= 1
    is_pair = counts == 2
    is_general = ~(is_trivial | is_pair)

    # Entries with at most one sample keep the initial value of 0.
    path_length = np.zeros(counts.shape)
    path_length[is_pair] = 1.0

    general_counts = counts[is_general]
    path_length[is_general] = (
        2.0 * (np.log(general_counts - 1.0) + np.euler_gamma)
        - 2.0 * (general_counts - 1.0) / general_counts
    )

    return path_length.reshape(original_shape)
|