3RNN/Lib/site-packages/sklearn/linear_model/_ransac.py
2024-05-26 19:49:15 +02:00

727 lines
25 KiB
Python

# Author: Johannes Schönberger
#
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
import numpy as np
from ..base import (
BaseEstimator,
MetaEstimatorMixin,
MultiOutputMixin,
RegressorMixin,
_fit_context,
clone,
)
from ..exceptions import ConvergenceWarning
from ..utils import check_consistent_length, check_random_state
from ..utils._bunch import Bunch
from ..utils._param_validation import (
HasMethods,
Interval,
Options,
RealNotInt,
StrOptions,
)
from ..utils.metadata_routing import (
MetadataRouter,
MethodMapping,
_raise_for_params,
_routing_enabled,
process_routing,
)
from ..utils.random import sample_without_replacement
from ..utils.validation import (
_check_method_params,
_check_sample_weight,
_deprecate_positional_args,
check_is_fitted,
has_fit_parameter,
)
from ._base import LinearRegression
_EPSILON = np.spacing(1)
def _dynamic_max_trials(n_inliers, n_samples, min_samples, probability):
"""Determine number trials such that at least one outlier-free subset is
sampled for the given inlier/outlier ratio.
Parameters
----------
n_inliers : int
Number of inliers in the data.
n_samples : int
Total number of samples in the data.
min_samples : int
Minimum number of samples chosen randomly from original data.
probability : float
Probability (confidence) that one outlier-free sample is generated.
Returns
-------
trials : int
Number of trials.
"""
inlier_ratio = n_inliers / float(n_samples)
nom = max(_EPSILON, 1 - probability)
denom = max(_EPSILON, 1 - inlier_ratio**min_samples)
if nom == 1:
return 0
if denom == 1:
return float("inf")
return abs(float(np.ceil(np.log(nom) / np.log(denom))))
class RANSACRegressor(
MetaEstimatorMixin,
RegressorMixin,
MultiOutputMixin,
BaseEstimator,
):
"""RANSAC (RANdom SAmple Consensus) algorithm.
RANSAC is an iterative algorithm for the robust estimation of parameters
from a subset of inliers from the complete data set.
Read more in the :ref:`User Guide <ransac_regression>`.
Parameters
----------
estimator : object, default=None
Base estimator object which implements the following methods:
* `fit(X, y)`: Fit model to given training data and target values.
* `score(X, y)`: Returns the mean accuracy on the given test data,
which is used for the stop criterion defined by `stop_score`.
Additionally, the score is used to decide which of two equally
large consensus sets is chosen as the better one.
* `predict(X)`: Returns predicted values using the linear model,
which is used to compute residual error using loss function.
If `estimator` is None, then
:class:`~sklearn.linear_model.LinearRegression` is used for
target values of dtype float.
Note that the current implementation only supports regression
estimators.
min_samples : int (>= 1) or float ([0, 1]), default=None
Minimum number of samples chosen randomly from original data. Treated
as an absolute number of samples for `min_samples >= 1`, treated as a
relative number `ceil(min_samples * X.shape[0])` for
`min_samples < 1`. This is typically chosen as the minimal number of
samples necessary to estimate the given `estimator`. By default a
:class:`~sklearn.linear_model.LinearRegression` estimator is assumed and
`min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly
dependent upon the model, so if a `estimator` other than
:class:`~sklearn.linear_model.LinearRegression` is used, the user must
provide a value.
residual_threshold : float, default=None
Maximum residual for a data sample to be classified as an inlier.
By default the threshold is chosen as the MAD (median absolute
deviation) of the target values `y`. Points whose residuals are
strictly equal to the threshold are considered as inliers.
is_data_valid : callable, default=None
This function is called with the randomly selected data before the
model is fitted to it: `is_data_valid(X, y)`. If its return value is
False the current randomly chosen sub-sample is skipped.
is_model_valid : callable, default=None
This function is called with the estimated model and the randomly
selected data: `is_model_valid(model, X, y)`. If its return value is
False the current randomly chosen sub-sample is skipped.
Rejecting samples with this function is computationally costlier than
with `is_data_valid`. `is_model_valid` should therefore only be used if
the estimated model is needed for making the rejection decision.
max_trials : int, default=100
Maximum number of iterations for random sample selection.
max_skips : int, default=np.inf
Maximum number of iterations that can be skipped due to finding zero
inliers or invalid data defined by ``is_data_valid`` or invalid models
defined by ``is_model_valid``.
.. versionadded:: 0.19
stop_n_inliers : int, default=np.inf
Stop iteration if at least this number of inliers are found.
stop_score : float, default=np.inf
Stop iteration if score is greater equal than this threshold.
stop_probability : float in range [0, 1], default=0.99
RANSAC iteration stops if at least one outlier-free set of the training
data is sampled in RANSAC. This requires to generate at least N
samples (iterations)::
N >= log(1 - probability) / log(1 - e**m)
where the probability (confidence) is typically set to high value such
as 0.99 (the default) and e is the current fraction of inliers w.r.t.
the total number of samples.
loss : str, callable, default='absolute_error'
String inputs, 'absolute_error' and 'squared_error' are supported which
find the absolute error and squared error per sample respectively.
If ``loss`` is a callable, then it should be a function that takes
two arrays as inputs, the true and predicted value and returns a 1-D
array with the i-th value of the array corresponding to the loss
on ``X[i]``.
If the loss on a sample is greater than the ``residual_threshold``,
then this sample is classified as an outlier.
.. versionadded:: 0.18
random_state : int, RandomState instance, default=None
The generator used to initialize the centers.
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.
Attributes
----------
estimator_ : object
Best fitted model (copy of the `estimator` object).
n_trials_ : int
Number of random selection trials until one of the stop criteria is
met. It is always ``<= max_trials``.
inlier_mask_ : bool array of shape [n_samples]
Boolean mask of inliers classified as ``True``.
n_skips_no_inliers_ : int
Number of iterations skipped due to finding zero inliers.
.. versionadded:: 0.19
n_skips_invalid_data_ : int
Number of iterations skipped due to invalid data defined by
``is_data_valid``.
.. versionadded:: 0.19
n_skips_invalid_model_ : int
Number of iterations skipped due to an invalid model defined by
``is_model_valid``.
.. versionadded:: 0.19
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
HuberRegressor : Linear regression model that is robust to outliers.
TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.
SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.
References
----------
.. [1] https://en.wikipedia.org/wiki/RANSAC
.. [2] https://www.sri.com/wp-content/uploads/2021/12/ransac-publication.pdf
.. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf
Examples
--------
>>> from sklearn.linear_model import RANSACRegressor
>>> from sklearn.datasets import make_regression
>>> X, y = make_regression(
... n_samples=200, n_features=2, noise=4.0, random_state=0)
>>> reg = RANSACRegressor(random_state=0).fit(X, y)
>>> reg.score(X, y)
0.9885...
>>> reg.predict(X[:1,])
array([-31.9417...])
""" # noqa: E501
_parameter_constraints: dict = {
"estimator": [HasMethods(["fit", "score", "predict"]), None],
"min_samples": [
Interval(Integral, 1, None, closed="left"),
Interval(RealNotInt, 0, 1, closed="both"),
None,
],
"residual_threshold": [Interval(Real, 0, None, closed="left"), None],
"is_data_valid": [callable, None],
"is_model_valid": [callable, None],
"max_trials": [
Interval(Integral, 0, None, closed="left"),
Options(Real, {np.inf}),
],
"max_skips": [
Interval(Integral, 0, None, closed="left"),
Options(Real, {np.inf}),
],
"stop_n_inliers": [
Interval(Integral, 0, None, closed="left"),
Options(Real, {np.inf}),
],
"stop_score": [Interval(Real, None, None, closed="both")],
"stop_probability": [Interval(Real, 0, 1, closed="both")],
"loss": [StrOptions({"absolute_error", "squared_error"}), callable],
"random_state": ["random_state"],
}
def __init__(
self,
estimator=None,
*,
min_samples=None,
residual_threshold=None,
is_data_valid=None,
is_model_valid=None,
max_trials=100,
max_skips=np.inf,
stop_n_inliers=np.inf,
stop_score=np.inf,
stop_probability=0.99,
loss="absolute_error",
random_state=None,
):
self.estimator = estimator
self.min_samples = min_samples
self.residual_threshold = residual_threshold
self.is_data_valid = is_data_valid
self.is_model_valid = is_model_valid
self.max_trials = max_trials
self.max_skips = max_skips
self.stop_n_inliers = stop_n_inliers
self.stop_score = stop_score
self.stop_probability = stop_probability
self.random_state = random_state
self.loss = loss
@_fit_context(
# RansacRegressor.estimator is not validated yet
prefer_skip_nested_validation=False
)
# TODO(1.7): remove `sample_weight` from the signature after deprecation
# cycle; for backwards compatibility: pop it from `fit_params` before the
# `_raise_for_params` check and reinsert it after the check
@_deprecate_positional_args(version="1.7")
def fit(self, X, y, *, sample_weight=None, **fit_params):
"""Fit estimator using RANSAC algorithm.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training data.
y : array-like of shape (n_samples,) or (n_samples, n_targets)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Individual weights for each sample
raises error if sample_weight is passed and estimator
fit method does not support it.
.. versionadded:: 0.18
**fit_params : dict
Parameters routed to the `fit` method of the sub-estimator via the
metadata routing API.
.. versionadded:: 1.5
Only available if
`sklearn.set_config(enable_metadata_routing=True)` is set. See
:ref:`Metadata Routing User Guide <metadata_routing>` for more
details.
Returns
-------
self : object
Fitted `RANSACRegressor` estimator.
Raises
------
ValueError
If no valid consensus set could be found. This occurs if
`is_data_valid` and `is_model_valid` return False for all
`max_trials` randomly chosen sub-samples.
"""
# Need to validate separately here. We can't pass multi_output=True
# because that would allow y to be csr. Delay expensive finiteness
# check to the estimator's own input validation.
_raise_for_params(fit_params, self, "fit")
check_X_params = dict(accept_sparse="csr", force_all_finite=False)
check_y_params = dict(ensure_2d=False)
X, y = self._validate_data(
X, y, validate_separately=(check_X_params, check_y_params)
)
check_consistent_length(X, y)
if self.estimator is not None:
estimator = clone(self.estimator)
else:
estimator = LinearRegression()
if self.min_samples is None:
if not isinstance(estimator, LinearRegression):
raise ValueError(
"`min_samples` needs to be explicitly set when estimator "
"is not a LinearRegression."
)
min_samples = X.shape[1] + 1
elif 0 < self.min_samples < 1:
min_samples = np.ceil(self.min_samples * X.shape[0])
elif self.min_samples >= 1:
min_samples = self.min_samples
if min_samples > X.shape[0]:
raise ValueError(
"`min_samples` may not be larger than number "
"of samples: n_samples = %d." % (X.shape[0])
)
if self.residual_threshold is None:
# MAD (median absolute deviation)
residual_threshold = np.median(np.abs(y - np.median(y)))
else:
residual_threshold = self.residual_threshold
if self.loss == "absolute_error":
if y.ndim == 1:
loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)
else:
loss_function = lambda y_true, y_pred: np.sum(
np.abs(y_true - y_pred), axis=1
)
elif self.loss == "squared_error":
if y.ndim == 1:
loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2
else:
loss_function = lambda y_true, y_pred: np.sum(
(y_true - y_pred) ** 2, axis=1
)
elif callable(self.loss):
loss_function = self.loss
random_state = check_random_state(self.random_state)
try: # Not all estimator accept a random_state
estimator.set_params(random_state=random_state)
except ValueError:
pass
estimator_fit_has_sample_weight = has_fit_parameter(estimator, "sample_weight")
estimator_name = type(estimator).__name__
if sample_weight is not None and not estimator_fit_has_sample_weight:
raise ValueError(
"%s does not support sample_weight. Sample"
" weights are only used for the calibration"
" itself." % estimator_name
)
if sample_weight is not None:
fit_params["sample_weight"] = sample_weight
if _routing_enabled():
routed_params = process_routing(self, "fit", **fit_params)
else:
routed_params = Bunch()
routed_params.estimator = Bunch(fit={}, predict={}, score={})
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
routed_params.estimator.fit = {"sample_weight": sample_weight}
n_inliers_best = 1
score_best = -np.inf
inlier_mask_best = None
X_inlier_best = None
y_inlier_best = None
inlier_best_idxs_subset = None
self.n_skips_no_inliers_ = 0
self.n_skips_invalid_data_ = 0
self.n_skips_invalid_model_ = 0
# number of data samples
n_samples = X.shape[0]
sample_idxs = np.arange(n_samples)
self.n_trials_ = 0
max_trials = self.max_trials
while self.n_trials_ < max_trials:
self.n_trials_ += 1
if (
self.n_skips_no_inliers_
+ self.n_skips_invalid_data_
+ self.n_skips_invalid_model_
) > self.max_skips:
break
# choose random sample set
subset_idxs = sample_without_replacement(
n_samples, min_samples, random_state=random_state
)
X_subset = X[subset_idxs]
y_subset = y[subset_idxs]
# check if random sample set is valid
if self.is_data_valid is not None and not self.is_data_valid(
X_subset, y_subset
):
self.n_skips_invalid_data_ += 1
continue
# cut `fit_params` down to `subset_idxs`
fit_params_subset = _check_method_params(
X, params=routed_params.estimator.fit, indices=subset_idxs
)
# fit model for current random sample set
estimator.fit(X_subset, y_subset, **fit_params_subset)
# check if estimated model is valid
if self.is_model_valid is not None and not self.is_model_valid(
estimator, X_subset, y_subset
):
self.n_skips_invalid_model_ += 1
continue
# residuals of all data for current random sample model
y_pred = estimator.predict(X)
residuals_subset = loss_function(y, y_pred)
# classify data into inliers and outliers
inlier_mask_subset = residuals_subset <= residual_threshold
n_inliers_subset = np.sum(inlier_mask_subset)
# less inliers -> skip current random sample
if n_inliers_subset < n_inliers_best:
self.n_skips_no_inliers_ += 1
continue
# extract inlier data set
inlier_idxs_subset = sample_idxs[inlier_mask_subset]
X_inlier_subset = X[inlier_idxs_subset]
y_inlier_subset = y[inlier_idxs_subset]
# cut `fit_params` down to `inlier_idxs_subset`
score_params_inlier_subset = _check_method_params(
X, params=routed_params.estimator.score, indices=inlier_idxs_subset
)
# score of inlier data set
score_subset = estimator.score(
X_inlier_subset,
y_inlier_subset,
**score_params_inlier_subset,
)
# same number of inliers but worse score -> skip current random
# sample
if n_inliers_subset == n_inliers_best and score_subset < score_best:
continue
# save current random sample as best sample
n_inliers_best = n_inliers_subset
score_best = score_subset
inlier_mask_best = inlier_mask_subset
X_inlier_best = X_inlier_subset
y_inlier_best = y_inlier_subset
inlier_best_idxs_subset = inlier_idxs_subset
max_trials = min(
max_trials,
_dynamic_max_trials(
n_inliers_best, n_samples, min_samples, self.stop_probability
),
)
# break if sufficient number of inliers or score is reached
if n_inliers_best >= self.stop_n_inliers or score_best >= self.stop_score:
break
# if none of the iterations met the required criteria
if inlier_mask_best is None:
if (
self.n_skips_no_inliers_
+ self.n_skips_invalid_data_
+ self.n_skips_invalid_model_
) > self.max_skips:
raise ValueError(
"RANSAC skipped more iterations than `max_skips` without"
" finding a valid consensus set. Iterations were skipped"
" because each randomly chosen sub-sample failed the"
" passing criteria. See estimator attributes for"
" diagnostics (n_skips*)."
)
else:
raise ValueError(
"RANSAC could not find a valid consensus set. All"
" `max_trials` iterations were skipped because each"
" randomly chosen sub-sample failed the passing criteria."
" See estimator attributes for diagnostics (n_skips*)."
)
else:
if (
self.n_skips_no_inliers_
+ self.n_skips_invalid_data_
+ self.n_skips_invalid_model_
) > self.max_skips:
warnings.warn(
(
"RANSAC found a valid consensus set but exited"
" early due to skipping more iterations than"
" `max_skips`. See estimator attributes for"
" diagnostics (n_skips*)."
),
ConvergenceWarning,
)
# estimate final model using all inliers
fit_params_best_idxs_subset = _check_method_params(
X, params=routed_params.estimator.fit, indices=inlier_best_idxs_subset
)
estimator.fit(X_inlier_best, y_inlier_best, **fit_params_best_idxs_subset)
self.estimator_ = estimator
self.inlier_mask_ = inlier_mask_best
return self
def predict(self, X, **params):
"""Predict using the estimated model.
This is a wrapper for `estimator_.predict(X)`.
Parameters
----------
X : {array-like or sparse matrix} of shape (n_samples, n_features)
Input data.
**params : dict
Parameters routed to the `predict` method of the sub-estimator via
the metadata routing API.
.. versionadded:: 1.5
Only available if
`sklearn.set_config(enable_metadata_routing=True)` is set. See
:ref:`Metadata Routing User Guide <metadata_routing>` for more
details.
Returns
-------
y : array, shape = [n_samples] or [n_samples, n_targets]
Returns predicted values.
"""
check_is_fitted(self)
X = self._validate_data(
X,
force_all_finite=False,
accept_sparse=True,
reset=False,
)
_raise_for_params(params, self, "predict")
if _routing_enabled():
predict_params = process_routing(self, "predict", **params).estimator[
"predict"
]
else:
predict_params = {}
return self.estimator_.predict(X, **predict_params)
def score(self, X, y, **params):
"""Return the score of the prediction.
This is a wrapper for `estimator_.score(X, y)`.
Parameters
----------
X : (array-like or sparse matrix} of shape (n_samples, n_features)
Training data.
y : array-like of shape (n_samples,) or (n_samples, n_targets)
Target values.
**params : dict
Parameters routed to the `score` method of the sub-estimator via
the metadata routing API.
.. versionadded:: 1.5
Only available if
`sklearn.set_config(enable_metadata_routing=True)` is set. See
:ref:`Metadata Routing User Guide <metadata_routing>` for more
details.
Returns
-------
z : float
Score of the prediction.
"""
check_is_fitted(self)
X = self._validate_data(
X,
force_all_finite=False,
accept_sparse=True,
reset=False,
)
_raise_for_params(params, self, "score")
if _routing_enabled():
score_params = process_routing(self, "score", **params).estimator["score"]
else:
score_params = {}
return self.estimator_.score(X, y, **score_params)
def get_metadata_routing(self):
"""Get metadata routing of this object.
Please check :ref:`User Guide <metadata_routing>` on how the routing
mechanism works.
.. versionadded:: 1.5
Returns
-------
routing : MetadataRouter
A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
routing information.
"""
router = MetadataRouter(owner=self.__class__.__name__).add(
estimator=self.estimator,
method_mapping=MethodMapping()
.add(caller="fit", callee="fit")
.add(caller="fit", callee="score")
.add(caller="score", callee="score")
.add(caller="predict", callee="predict"),
)
return router
def _more_tags(self):
return {
"_xfail_checks": {
"check_sample_weights_invariance": (
"zero sample_weight is not equivalent to removing samples"
),
}
}