Inzynierka/Lib/site-packages/sklearn/linear_model/_theil_sen.py
2023-06-02 12:51:02 +02:00

457 lines
15 KiB
Python

"""
A Theil-Sen Estimator for Multiple Linear Regression Model
"""
# Author: Florian Wilhelm <florian.wilhelm@gmail.com>
#
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
from itertools import combinations
import numpy as np
from scipy import linalg
from scipy.special import binom
from scipy.linalg.lapack import get_lapack_funcs
from joblib import effective_n_jobs
from ._base import LinearModel
from ..base import RegressorMixin
from ..utils import check_random_state
from ..utils._param_validation import Interval
from ..utils.parallel import delayed, Parallel
from ..exceptions import ConvergenceWarning
_EPSILON = np.finfo(np.double).eps
def _modified_weiszfeld_step(X, x_old):
"""Modified Weiszfeld step.
This function defines one iteration step in order to approximate the
spatial median (L1 median). It is a form of an iteratively re-weighted
least squares method.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples and
`n_features` is the number of features.
x_old : ndarray of shape = (n_features,)
Current start vector.
Returns
-------
x_new : ndarray of shape (n_features,)
New iteration step.
References
----------
- On Computation of Spatial Median for Robust Data Mining, 2005
T. Kärkkäinen and S. Äyrämö
http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
"""
diff = X - x_old
diff_norm = np.sqrt(np.sum(diff**2, axis=1))
mask = diff_norm >= _EPSILON
# x_old equals one of our samples
is_x_old_in_X = int(mask.sum() < X.shape[0])
diff = diff[mask]
diff_norm = diff_norm[mask][:, np.newaxis]
quotient_norm = linalg.norm(np.sum(diff / diff_norm, axis=0))
if quotient_norm > _EPSILON: # to avoid division by zero
new_direction = np.sum(X[mask, :] / diff_norm, axis=0) / np.sum(
1 / diff_norm, axis=0
)
else:
new_direction = 1.0
quotient_norm = 1.0
return (
max(0.0, 1.0 - is_x_old_in_X / quotient_norm) * new_direction
+ min(1.0, is_x_old_in_X / quotient_norm) * x_old
)
def _spatial_median(X, max_iter=300, tol=1.0e-3):
"""Spatial median (L1 median).
The spatial median is member of a class of so-called M-estimators which
are defined by an optimization problem. Given a number of p points in an
n-dimensional space, the point x minimizing the sum of all distances to the
p other points is called spatial median.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples and
`n_features` is the number of features.
max_iter : int, default=300
Maximum number of iterations.
tol : float, default=1.e-3
Stop the algorithm if spatial_median has converged.
Returns
-------
spatial_median : ndarray of shape = (n_features,)
Spatial median.
n_iter : int
Number of iterations needed.
References
----------
- On Computation of Spatial Median for Robust Data Mining, 2005
T. Kärkkäinen and S. Äyrämö
http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
"""
if X.shape[1] == 1:
return 1, np.median(X.ravel(), keepdims=True)
tol **= 2 # We are computing the tol on the squared norm
spatial_median_old = np.mean(X, axis=0)
for n_iter in range(max_iter):
spatial_median = _modified_weiszfeld_step(X, spatial_median_old)
if np.sum((spatial_median_old - spatial_median) ** 2) < tol:
break
else:
spatial_median_old = spatial_median
else:
warnings.warn(
"Maximum number of iterations {max_iter} reached in "
"spatial median for TheilSen regressor."
"".format(max_iter=max_iter),
ConvergenceWarning,
)
return n_iter, spatial_median
def _breakdown_point(n_samples, n_subsamples):
"""Approximation of the breakdown point.
Parameters
----------
n_samples : int
Number of samples.
n_subsamples : int
Number of subsamples to consider.
Returns
-------
breakdown_point : float
Approximation of breakdown point.
"""
return (
1
- (
0.5 ** (1 / n_subsamples) * (n_samples - n_subsamples + 1)
+ n_subsamples
- 1
)
/ n_samples
)
def _lstsq(X, y, indices, fit_intercept):
"""Least Squares Estimator for TheilSenRegressor class.
This function calculates the least squares method on a subset of rows of X
and y defined by the indices array. Optionally, an intercept column is
added if intercept is set to true.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Design matrix, where `n_samples` is the number of samples and
`n_features` is the number of features.
y : ndarray of shape (n_samples,)
Target vector, where `n_samples` is the number of samples.
indices : ndarray of shape (n_subpopulation, n_subsamples)
Indices of all subsamples with respect to the chosen subpopulation.
fit_intercept : bool
Fit intercept or not.
Returns
-------
weights : ndarray of shape (n_subpopulation, n_features + intercept)
Solution matrix of n_subpopulation solved least square problems.
"""
fit_intercept = int(fit_intercept)
n_features = X.shape[1] + fit_intercept
n_subsamples = indices.shape[1]
weights = np.empty((indices.shape[0], n_features))
X_subpopulation = np.ones((n_subsamples, n_features))
# gelss need to pad y_subpopulation to be of the max dim of X_subpopulation
y_subpopulation = np.zeros((max(n_subsamples, n_features)))
(lstsq,) = get_lapack_funcs(("gelss",), (X_subpopulation, y_subpopulation))
for index, subset in enumerate(indices):
X_subpopulation[:, fit_intercept:] = X[subset, :]
y_subpopulation[:n_subsamples] = y[subset]
weights[index] = lstsq(X_subpopulation, y_subpopulation)[1][:n_features]
return weights
class TheilSenRegressor(RegressorMixin, LinearModel):
"""Theil-Sen Estimator: robust multivariate regression model.
The algorithm calculates least square solutions on subsets with size
n_subsamples of the samples in X. Any value of n_subsamples between the
number of features and samples leads to an estimator with a compromise
between robustness and efficiency. Since the number of least square
solutions is "n_samples choose n_subsamples", it can be extremely large
and can therefore be limited with max_subpopulation. If this limit is
reached, the subsets are chosen randomly. In a final step, the spatial
median (or L1 median) is calculated of all least square solutions.
Read more in the :ref:`User Guide <theil_sen_regression>`.
Parameters
----------
fit_intercept : bool, default=True
Whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations.
copy_X : bool, default=True
If True, X will be copied; else, it may be overwritten.
max_subpopulation : int, default=1e4
Instead of computing with a set of cardinality 'n choose k', where n is
the number of samples and k is the number of subsamples (at least
number of features), consider only a stochastic subpopulation of a
given maximal size if 'n choose k' is larger than max_subpopulation.
For other than small problem sizes this parameter will determine
memory usage and runtime if n_subsamples is not changed. Note that the
data type should be int but floats such as 1e4 can be accepted too.
n_subsamples : int, default=None
Number of samples to calculate the parameters. This is at least the
number of features (plus 1 if fit_intercept=True) and the number of
samples as a maximum. A lower number leads to a higher breakdown
point and a low efficiency while a high number leads to a low
breakdown point and a high efficiency. If None, take the
minimum number of subsamples leading to maximal robustness.
If n_subsamples is set to n_samples, Theil-Sen is identical to least
squares.
max_iter : int, default=300
Maximum number of iterations for the calculation of spatial median.
tol : float, default=1e-3
Tolerance when calculating spatial median.
random_state : int, RandomState instance or None, default=None
A random number generator instance to define the state of the random
permutations generator. Pass an int for reproducible output across
multiple function calls.
See :term:`Glossary <random_state>`.
n_jobs : int, default=None
Number of CPUs to use during the cross validation.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
verbose : bool, default=False
Verbose mode when fitting the model.
Attributes
----------
coef_ : ndarray of shape (n_features,)
Coefficients of the regression model (median of distribution).
intercept_ : float
Estimated intercept of regression model.
breakdown_ : float
Approximated breakdown point.
n_iter_ : int
Number of iterations needed for the spatial median.
n_subpopulation_ : int
Number of combinations taken into account from 'n choose k', where n is
the number of samples and k is the number of subsamples.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
HuberRegressor : Linear regression model that is robust to outliers.
RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.
References
----------
- Theil-Sen Estimators in a Multiple Linear Regression Model, 2009
Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang
http://home.olemiss.edu/~xdang/papers/MTSE.pdf
Examples
--------
>>> from sklearn.linear_model import TheilSenRegressor
>>> from sklearn.datasets import make_regression
>>> X, y = make_regression(
... n_samples=200, n_features=2, noise=4.0, random_state=0)
>>> reg = TheilSenRegressor(random_state=0).fit(X, y)
>>> reg.score(X, y)
0.9884...
>>> reg.predict(X[:1,])
array([-31.5871...])
"""
_parameter_constraints: dict = {
"fit_intercept": ["boolean"],
"copy_X": ["boolean"],
# target_type should be Integral but can accept Real for backward compatibility
"max_subpopulation": [Interval(Real, 1, None, closed="left")],
"n_subsamples": [None, Integral],
"max_iter": [Interval(Integral, 0, None, closed="left")],
"tol": [Interval(Real, 0.0, None, closed="left")],
"random_state": ["random_state"],
"n_jobs": [None, Integral],
"verbose": ["verbose"],
}
def __init__(
self,
*,
fit_intercept=True,
copy_X=True,
max_subpopulation=1e4,
n_subsamples=None,
max_iter=300,
tol=1.0e-3,
random_state=None,
n_jobs=None,
verbose=False,
):
self.fit_intercept = fit_intercept
self.copy_X = copy_X
self.max_subpopulation = max_subpopulation
self.n_subsamples = n_subsamples
self.max_iter = max_iter
self.tol = tol
self.random_state = random_state
self.n_jobs = n_jobs
self.verbose = verbose
def _check_subparams(self, n_samples, n_features):
n_subsamples = self.n_subsamples
if self.fit_intercept:
n_dim = n_features + 1
else:
n_dim = n_features
if n_subsamples is not None:
if n_subsamples > n_samples:
raise ValueError(
"Invalid parameter since n_subsamples > "
"n_samples ({0} > {1}).".format(n_subsamples, n_samples)
)
if n_samples >= n_features:
if n_dim > n_subsamples:
plus_1 = "+1" if self.fit_intercept else ""
raise ValueError(
"Invalid parameter since n_features{0} "
"> n_subsamples ({1} > {2})."
"".format(plus_1, n_dim, n_subsamples)
)
else: # if n_samples < n_features
if n_subsamples != n_samples:
raise ValueError(
"Invalid parameter since n_subsamples != "
"n_samples ({0} != {1}) while n_samples "
"< n_features.".format(n_subsamples, n_samples)
)
else:
n_subsamples = min(n_dim, n_samples)
all_combinations = max(1, np.rint(binom(n_samples, n_subsamples)))
n_subpopulation = int(min(self.max_subpopulation, all_combinations))
return n_subsamples, n_subpopulation
def fit(self, X, y):
"""Fit linear model.
Parameters
----------
X : ndarray of shape (n_samples, n_features)
Training data.
y : ndarray of shape (n_samples,)
Target values.
Returns
-------
self : returns an instance of self.
Fitted `TheilSenRegressor` estimator.
"""
self._validate_params()
random_state = check_random_state(self.random_state)
X, y = self._validate_data(X, y, y_numeric=True)
n_samples, n_features = X.shape
n_subsamples, self.n_subpopulation_ = self._check_subparams(
n_samples, n_features
)
self.breakdown_ = _breakdown_point(n_samples, n_subsamples)
if self.verbose:
print("Breakdown point: {0}".format(self.breakdown_))
print("Number of samples: {0}".format(n_samples))
tol_outliers = int(self.breakdown_ * n_samples)
print("Tolerable outliers: {0}".format(tol_outliers))
print("Number of subpopulations: {0}".format(self.n_subpopulation_))
# Determine indices of subpopulation
if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation:
indices = list(combinations(range(n_samples), n_subsamples))
else:
indices = [
random_state.choice(n_samples, size=n_subsamples, replace=False)
for _ in range(self.n_subpopulation_)
]
n_jobs = effective_n_jobs(self.n_jobs)
index_list = np.array_split(indices, n_jobs)
weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
delayed(_lstsq)(X, y, index_list[job], self.fit_intercept)
for job in range(n_jobs)
)
weights = np.vstack(weights)
self.n_iter_, coefs = _spatial_median(
weights, max_iter=self.max_iter, tol=self.tol
)
if self.fit_intercept:
self.intercept_ = coefs[0]
self.coef_ = coefs[1:]
else:
self.intercept_ = 0.0
self.coef_ = coefs
return self