# Inzynierka/Lib/site-packages/sklearn/dummy.py
#
# 675 lines
# 23 KiB
# Python
# Raw Permalink Normal View History
#
# 2023-06-02 12:51:02 +02:00
# Author: Mathieu Blondel <mathieu@mblondel.org>
# Arnaud Joly <a.joly@ulg.ac.be>
# Maheshakya Wijewardena <maheshakya.10@cse.mrt.ac.lk>
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
import numpy as np
import scipy.sparse as sp
from .base import BaseEstimator, ClassifierMixin, RegressorMixin
from .base import MultiOutputMixin
from .utils import check_random_state
from .utils._param_validation import StrOptions, Interval
from .utils.validation import _num_samples
from .utils.validation import check_array
from .utils.validation import check_consistent_length
from .utils.validation import check_is_fitted, _check_sample_weight
from .utils.random import _random_choice_csc
from .utils.stats import _weighted_percentile
from .utils.multiclass import class_distribution
class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):
    """DummyClassifier makes predictions that ignore the input features.

    This classifier serves as a simple baseline to compare against other more
    complex classifiers.

    The specific behavior of the baseline is selected with the `strategy`
    parameter.

    All strategies make predictions that ignore the input feature values passed
    as the `X` argument to `fit` and `predict`. The predictions, however,
    typically depend on values observed in the `y` parameter passed to `fit`.

    Note that the "stratified" and "uniform" strategies lead to
    non-deterministic predictions that can be rendered deterministic by setting
    the `random_state` parameter if needed. The other strategies are naturally
    deterministic and, once fit, always return the same constant prediction
    for any value of `X`.

    Read more in the :ref:`User Guide <dummy_estimators>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    strategy : {"most_frequent", "prior", "stratified", "uniform", \
            "constant"}, default="prior"
        Strategy to use to generate predictions.

        * "most_frequent": the `predict` method always returns the most
          frequent class label in the observed `y` argument passed to `fit`.
          The `predict_proba` method returns the matching one-hot encoded
          vector.
        * "prior": the `predict` method always returns the most frequent
          class label in the observed `y` argument passed to `fit` (like
          "most_frequent"). ``predict_proba`` always returns the empirical
          class distribution of `y` also known as the empirical class prior
          distribution.
        * "stratified": the `predict_proba` method randomly samples one-hot
          vectors from a multinomial distribution parametrized by the empirical
          class prior probabilities.
          The `predict` method returns the class label which got probability
          one in the one-hot vector of `predict_proba`.
          Each sampled row of both methods is therefore independent and
          identically distributed.
        * "uniform": generates predictions uniformly at random from the list
          of unique classes observed in `y`, i.e. each class has equal
          probability.
        * "constant": always predicts a constant label that is provided by
          the user. This is useful for metrics that evaluate a non-majority
          class.

          .. versionchanged:: 0.24
             The default value of `strategy` has changed to "prior" in version
             0.24.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness to generate the predictions when
        ``strategy='stratified'`` or ``strategy='uniform'``.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    constant : int or str or array-like of shape (n_outputs,), default=None
        The explicit constant as predicted by the "constant" strategy. This
        parameter is useful only for the "constant" strategy.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,) or list of such arrays
        Unique class labels observed in `y`. For multi-output classification
        problems, this attribute is a list of arrays as each output has an
        independent set of possible classes.

    n_classes_ : int or list of int
        Number of labels for each output.

    class_prior_ : ndarray of shape (n_classes,) or list of such arrays
        Frequency of each class observed in `y`. For multioutput classification
        problems, this is computed independently for each output.

    n_outputs_ : int
        Number of outputs.

    sparse_output_ : bool
        True if the array returned from predict is to be in sparse CSC format.
        Is automatically set to True if the input `y` is passed in sparse
        format.

    See Also
    --------
    DummyRegressor : Regressor that makes predictions using simple rules.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.dummy import DummyClassifier
    >>> X = np.array([-1, 1, 1, 1])
    >>> y = np.array([0, 1, 1, 1])
    >>> dummy_clf = DummyClassifier(strategy="most_frequent")
    >>> dummy_clf.fit(X, y)
    DummyClassifier(strategy='most_frequent')
    >>> dummy_clf.predict(X)
    array([1, 1, 1, 1])
    >>> dummy_clf.score(X, y)
    0.75
    """

    _parameter_constraints: dict = {
        "strategy": [
            StrOptions({"most_frequent", "prior", "stratified", "uniform", "constant"})
        ],
        "random_state": ["random_state"],
        "constant": [Integral, str, "array-like", None],
    }

    def __init__(self, *, strategy="prior", random_state=None, constant=None):
        self.strategy = strategy
        self.random_state = random_state
        self.constant = constant

    def fit(self, X, y, sample_weight=None):
        """Fit the baseline classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If ``strategy="constant"`` and `constant` is None, does not have
            one entry per output, or is not a label present in `y`.
        """
        self._validate_params()

        self._strategy = self.strategy

        if self._strategy == "uniform" and sp.issparse(y):
            y = y.toarray()
            warnings.warn(
                "A local copy of the target data has been converted "
                "to a numpy array. Predicting on sparse target data "
                "with the uniform strategy would not save memory "
                "and would be slower.",
                UserWarning,
            )

        # Remember whether `y` was sparse: `predict` mirrors the format.
        self.sparse_output_ = sp.issparse(y)

        if not self.sparse_output_:
            y = np.asarray(y)
            y = np.atleast_1d(y)

        # Internally always work with a 2-D target, one column per output.
        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        check_consistent_length(X, y)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        if self._strategy == "constant":
            if self.constant is None:
                raise ValueError(
                    "Constant target value has to be specified "
                    "when the constant strategy is used."
                )
            else:
                constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))
                if constant.shape[0] != self.n_outputs_:
                    raise ValueError(
                        "Constant target value should have shape (%d, 1)."
                        % self.n_outputs_
                    )

        (self.classes_, self.n_classes_, self.class_prior_) = class_distribution(
            y, sample_weight
        )

        if self._strategy == "constant":
            for k in range(self.n_outputs_):
                if not any(constant[k][0] == c for c in self.classes_[k]):
                    # Checking in case of constant strategy if the constant
                    # provided by the user is in y.
                    err_msg = (
                        "The constant target value must be present in "
                        "the training data. You provided constant={}. "
                        "Possible values are: {}.".format(
                            self.constant, list(self.classes_[k])
                        )
                    )
                    raise ValueError(err_msg)

        # Single-output problems expose plain values instead of 1-element lists.
        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]
            self.class_prior_ = self.class_prior_[0]

        return self

    def predict(self, X):
        """Perform classification on test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.

        Returns
        -------
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Predicted target values for X.

        Raises
        ------
        ValueError
            If the estimator was fitted on a sparse target and the strategy
            is "uniform".
        """
        check_is_fitted(self)

        # numpy random_state expects Python int and not long as size argument
        # under Windows
        n_samples = _num_samples(X)
        rs = check_random_state(self.random_state)

        n_classes_ = self.n_classes_
        classes_ = self.classes_
        class_prior_ = self.class_prior_
        constant = self.constant
        if self.n_outputs_ == 1:
            # Get same type even for self.n_outputs_ == 1
            n_classes_ = [n_classes_]
            classes_ = [classes_]
            class_prior_ = [class_prior_]
            constant = [constant]
        # Compute probability only once
        if self._strategy == "stratified":
            proba = self.predict_proba(X)
            if self.n_outputs_ == 1:
                proba = [proba]

        if self.sparse_output_:
            class_prob = None
            if self._strategy in ("most_frequent", "prior"):
                classes_ = [np.array([cp.argmax()]) for cp in class_prior_]

            elif self._strategy == "stratified":
                class_prob = class_prior_

            elif self._strategy == "uniform":
                raise ValueError(
                    "Sparse target prediction is not "
                    "supported with the uniform strategy"
                )

            elif self._strategy == "constant":
                classes_ = [np.array([c]) for c in constant]

            y = _random_choice_csc(n_samples, classes_, class_prob, self.random_state)
        else:
            if self._strategy in ("most_frequent", "prior"):
                y = np.tile(
                    [
                        classes_[k][class_prior_[k].argmax()]
                        for k in range(self.n_outputs_)
                    ],
                    [n_samples, 1],
                )

            elif self._strategy == "stratified":
                y = np.vstack(
                    [
                        classes_[k][proba[k].argmax(axis=1)]
                        for k in range(self.n_outputs_)
                    ]
                ).T

            elif self._strategy == "uniform":
                ret = [
                    classes_[k][rs.randint(n_classes_[k], size=n_samples)]
                    for k in range(self.n_outputs_)
                ]
                y = np.vstack(ret).T

            elif self._strategy == "constant":
                y = np.tile(self.constant, (n_samples, 1))

            if self.n_outputs_ == 1:
                y = np.ravel(y)

        return y

    def predict_proba(self, X):
        """
        Return probability estimates for the test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.

        Returns
        -------
        P : ndarray of shape (n_samples, n_classes) or list of such arrays
            Returns the probability of the sample for each class in
            the model, where classes are ordered arithmetically, for each
            output.
        """
        check_is_fitted(self)

        # numpy random_state expects Python int and not long as size argument
        # under Windows
        n_samples = _num_samples(X)
        rs = check_random_state(self.random_state)

        n_classes_ = self.n_classes_
        classes_ = self.classes_
        class_prior_ = self.class_prior_
        constant = self.constant
        if self.n_outputs_ == 1:
            # Get same type even for self.n_outputs_ == 1
            n_classes_ = [n_classes_]
            classes_ = [classes_]
            class_prior_ = [class_prior_]
            constant = [constant]

        P = []
        for k in range(self.n_outputs_):
            if self._strategy == "most_frequent":
                ind = class_prior_[k].argmax()
                out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
                out[:, ind] = 1.0
            elif self._strategy == "prior":
                out = np.ones((n_samples, 1)) * class_prior_[k]

            elif self._strategy == "stratified":
                out = rs.multinomial(1, class_prior_[k], size=n_samples)
                out = out.astype(np.float64)

            elif self._strategy == "uniform":
                out = np.ones((n_samples, n_classes_[k]), dtype=np.float64)
                out /= n_classes_[k]

            elif self._strategy == "constant":
                ind = np.where(classes_[k] == constant[k])
                out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
                out[:, ind] = 1.0

            P.append(out)

        if self.n_outputs_ == 1:
            P = P[0]

        return P

    def predict_log_proba(self, X):
        """
        Return log probability estimates for the test vectors X.

        Parameters
        ----------
        X : {array-like, object with finite length or shape}
            Test data.

        Returns
        -------
        P : ndarray of shape (n_samples, n_classes) or list of such arrays
            Returns the log probability of the sample for each class in
            the model, where classes are ordered arithmetically for each
            output.
        """
        proba = self.predict_proba(X)
        if self.n_outputs_ == 1:
            return np.log(proba)
        else:
            return [np.log(p) for p in proba]

    def _more_tags(self):
        return {
            "poor_score": True,
            "no_validation": True,
            "_xfail_checks": {
                "check_methods_subset_invariance": "fails for the predict method",
                "check_methods_sample_order_invariance": "fails for the predict method",
            },
        }

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : None or array-like of shape (n_samples, n_features)
            Test samples. Passing None as test samples gives the same result
            as passing real test samples, since DummyClassifier
            operates independently of the sampled observations.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) w.r.t. y.
        """
        if X is None:
            # Only the number of rows of the placeholder matters. Use
            # `_num_samples` rather than `len` so that a sparse `y` (whose
            # `__len__` raises TypeError) is handled like in `predict`.
            X = np.zeros(shape=(_num_samples(y), 1))
        return super().score(X, y, sample_weight)
class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Regressor that makes predictions using simple rules.

    This regressor is useful as a simple baseline to compare with other
    (real) regressors. Do not use it for real problems.

    Read more in the :ref:`User Guide <dummy_estimators>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    strategy : {"mean", "median", "quantile", "constant"}, default="mean"
        Strategy to use to generate predictions.

        * "mean": always predicts the mean of the training set
        * "median": always predicts the median of the training set
        * "quantile": always predicts a specified quantile of the training set,
          provided with the quantile parameter.
        * "constant": always predicts a constant value that is provided by
          the user.

    constant : int or float or array-like of shape (n_outputs,), default=None
        The explicit constant as predicted by the "constant" strategy. This
        parameter is useful only for the "constant" strategy.

    quantile : float in [0.0, 1.0], default=None
        The quantile to predict using the "quantile" strategy. A quantile of
        0.5 corresponds to the median, while 0.0 to the minimum and 1.0 to the
        maximum.

    Attributes
    ----------
    constant_ : ndarray of shape (1, n_outputs)
        Mean or median or quantile of the training targets or constant value
        given by the user.

    n_outputs_ : int
        Number of outputs.

    See Also
    --------
    DummyClassifier: Classifier that makes predictions using simple rules.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.dummy import DummyRegressor
    >>> X = np.array([1.0, 2.0, 3.0, 4.0])
    >>> y = np.array([2.0, 3.0, 5.0, 10.0])
    >>> dummy_regr = DummyRegressor(strategy="mean")
    >>> dummy_regr.fit(X, y)
    DummyRegressor()
    >>> dummy_regr.predict(X)
    array([5., 5., 5., 5.])
    >>> dummy_regr.score(X, y)
    0.0
    """

    _parameter_constraints: dict = {
        "strategy": [StrOptions({"mean", "median", "quantile", "constant"})],
        "quantile": [Interval(Real, 0.0, 1.0, closed="both"), None],
        "constant": [
            Interval(Real, None, None, closed="neither"),
            "array-like",
            None,
        ],
    }

    def __init__(self, *, strategy="mean", constant=None, quantile=None):
        self.strategy = strategy
        self.constant = constant
        self.quantile = quantile

    def fit(self, X, y, sample_weight=None):
        """Fit the random regressor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            Fitted estimator.

        Raises
        ------
        ValueError
            If `y` is empty, if ``strategy="quantile"`` and `quantile` is
            None, or if ``strategy="constant"`` and `constant` does not
            provide exactly one value per output.
        TypeError
            If ``strategy="constant"`` and `constant` is None.
        """
        self._validate_params()

        y = check_array(y, ensure_2d=False, input_name="y")
        if len(y) == 0:
            raise ValueError("y must not be empty.")

        # Internally always work with a 2-D target, one column per output.
        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))
        self.n_outputs_ = y.shape[1]

        check_consistent_length(X, y, sample_weight)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        if self.strategy == "mean":
            self.constant_ = np.average(y, axis=0, weights=sample_weight)

        elif self.strategy == "median":
            if sample_weight is None:
                self.constant_ = np.median(y, axis=0)
            else:
                self.constant_ = [
                    _weighted_percentile(y[:, k], sample_weight, percentile=50.0)
                    for k in range(self.n_outputs_)
                ]

        elif self.strategy == "quantile":
            if self.quantile is None:
                raise ValueError(
                    "When using `strategy='quantile', you have to specify the desired "
                    "quantile in the range [0, 1]."
                )
            percentile = self.quantile * 100.0
            if sample_weight is None:
                self.constant_ = np.percentile(y, axis=0, q=percentile)
            else:
                self.constant_ = [
                    _weighted_percentile(y[:, k], sample_weight, percentile=percentile)
                    for k in range(self.n_outputs_)
                ]

        elif self.strategy == "constant":
            if self.constant is None:
                raise TypeError(
                    "Constant target value has to be specified "
                    "when the constant strategy is used."
                )

            self.constant_ = check_array(
                self.constant,
                accept_sparse=["csr", "csc", "coo"],
                ensure_2d=False,
                ensure_min_samples=0,
            )

            if self.n_outputs_ == 1:
                # A scalar constant is a 0-d array (empty shape, product 1).
                # Anything holding a different number of values could never
                # be broadcast by `predict`, so reject it here instead of
                # failing later with an opaque broadcasting error.
                bad_constant = int(np.prod(self.constant_.shape)) != 1
            else:
                bad_constant = self.constant_.shape[0] != y.shape[1]

            if bad_constant:
                raise ValueError(
                    "Constant target value should have shape (%d, 1)." % y.shape[1]
                )

        # Store every strategy's result as a single row of per-output values.
        self.constant_ = np.reshape(self.constant_, (1, -1))
        return self

    def predict(self, X, return_std=False):
        """Predict the fitted constant for each sample in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.
            All zeros in this case.

            .. versionadded:: 0.20

        Returns
        -------
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Predicted target values for X.

        y_std : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Standard deviation of predictive distribution of query points.
            Only returned when `return_std` is True.
        """
        check_is_fitted(self)
        n_samples = _num_samples(X)

        y = np.full(
            (n_samples, self.n_outputs_),
            self.constant_,
            dtype=np.array(self.constant_).dtype,
        )
        y_std = np.zeros((n_samples, self.n_outputs_))

        if self.n_outputs_ == 1:
            y = np.ravel(y)
            y_std = np.ravel(y_std)

        return (y, y_std) if return_std else y

    def _more_tags(self):
        return {"poor_score": True, "no_validation": True}

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as `(1 - u/v)`, where `u` is the
        residual sum of squares `((y_true - y_pred) ** 2).sum()` and `v` is the
        total sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The best
        possible score is 1.0 and it can be negative (because the model can be
        arbitrarily worse). A constant model that always predicts the expected
        value of y, disregarding the input features, would get a R^2 score of
        0.0.

        Parameters
        ----------
        X : None or array-like of shape (n_samples, n_features)
            Test samples. Passing None as test samples gives the same result
            as passing real test samples, since `DummyRegressor`
            operates independently of the sampled observations.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            R^2 of `self.predict(X)` w.r.t. y.
        """
        if X is None:
            # Only the number of rows of the placeholder matters.
            X = np.zeros(shape=(len(y), 1))
        return super().score(X, y, sample_weight)